Update readme.

projectmoon 2024-07-18 22:13:00 +02:00
parent 9e38ccc33d
commit c60820b660
3 changed files with 82 additions and 21 deletions

View File

@@ -6,6 +6,9 @@ version: 0.1.0
 required_open_webui_version: 0.3.9
 """
+# Documentation: https://git.agnos.is/projectmoon/open-webui-filters
+# System Imports
 import chromadb
 from chromadb import ClientAPI as ChromaAPI
 from chromadb import Collection as ChromaCollection

View File

@@ -2,10 +2,16 @@
 title: Memory Filter
 author: projectmoon
 author_url: https://git.agnos.is/projectmoon/open-webui-filters
-version: 0.0.1
-required_open_webui_version: 0.3.8
+version: 0.0.2
+required_open_webui_version: 0.3.9
 """
+# Documentation: https://git.agnos.is/projectmoon/open-webui-filters
+#
+# Changelog:
+# 0.0.1 - Initial release, proof of concept
+# 0.0.2 - Slightly less hacky (but still hacky) way of getting chat IDs
 # System imports
 import asyncio
 import hashlib
@@ -429,6 +435,24 @@ class Story(BaseModel):
 # Utils
+class SessionInfo(BaseModel):
+    chat_id: str
+    message_id: str
+    session_id: str
+
+def extract_session_info(event_emitter) -> Optional[SessionInfo]:
+    """The latest innovation in hacky workarounds."""
+    try:
+        info = event_emitter.__closure__[0].cell_contents
+        return SessionInfo(
+            chat_id=info["chat_id"],
+            message_id=info["message_id"],
+            session_id=info["session_id"]
+        )
+    except:
+        return None
+
 def create_enrichment_summary_prompt(
     narrative: str,
     character_details: List[str],
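
For context on the workaround added above: `extract_session_info` assumes the `__event_emitter__` callable that Open WebUI hands to a filter is a nested function whose closure captures a dict containing `chat_id`, `message_id`, and `session_id`. A minimal, self-contained sketch of the mechanism; `make_emitter` is purely illustrative and not Open WebUI's actual code:

```python
from typing import Any

def make_emitter(info: dict):
    # Stand-in for a request-scoped emitter: the inner function closes
    # over `info`, so the metadata travels with the callable.
    async def emitter(event: Any):
        ...
    return emitter

emitter = make_emitter({"chat_id": "c1", "message_id": "m1", "session_id": "s1"})

# Captured variables are reachable via __closure__; cell 0 holds `info` here.
print(emitter.__closure__[0].cell_contents["chat_id"])  # -> c1
```
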
@@ -501,11 +525,6 @@ def create_context(results: SummarizerResponse) -> Optional[str]:
     return message
-def write_log(text):
-    with open(f"/tmp/test-memories", "a") as file:
-        file.write(text + "\n")
-
 def split_messages(messages, keep_amount):
     if len(messages) <= keep_amount:
         return messages[:], []
@@ -621,17 +640,18 @@ class Filter:
         __event_emitter__: Callable[[Any], Awaitable[None]],
     ) -> dict:
         # Useful things to have around.
+        self.session_info = extract_session_info(__event_emitter__)
         self.event_emitter = __event_emitter__
         self.summarizer_model_id = self.valves.summarizer_model(body)
         await self.send_outlet_status(__event_emitter__, False)
         messages = body['messages']
-        convo_id = self.extract_convo_id(messages)
         # summarize into plot points.
         summary = await self.summarize(messages)
         story = Story(
-            convo_id=convo_id, client=CHROMA_CLIENT,
+            convo_id=self.session_info.chat_id,
+            client=CHROMA_CLIENT,
             embedding_func=EMBEDDING_FUNCTION,
             messages=messages
         )
@@ -693,7 +713,10 @@ class Filter:
         return await summarizer.summarize()
-    async def enrich(self, story: Story, messages) -> SummarizerResponse:
+    async def enrich(self, story: Story, messages) -> Optional[SummarizerResponse]:
+        if len(messages) < 2:
+            return None
         await self.set_enriching_status("searching")
         query_generation_result = await self.generate_enrichment_queries(messages)
         character_results = [result
@@ -710,7 +733,8 @@ class Filter:
     async def update_system_message(self, messages, system_message):
         story = Story(
-            convo_id=None, client=CHROMA_CLIENT,
+            convo_id=self.session_info.chat_id,
+            client=CHROMA_CLIENT,
             embedding_func=EMBEDDING_FUNCTION,
             messages=messages
         )
@@ -720,8 +744,11 @@ class Filter:
         if story.convo_id == "<unset>":
             return
-        enrichment_summary: SummarizerResponse = await self.enrich(story, messages)
-        context = create_context(enrichment_summary)
+        enrichment_summary: Optional[SummarizerResponse] = await self.enrich(story, messages)
+        if enrichment_summary:
+            context = create_context(enrichment_summary)
+        else:
+            context = None
         if context:
             system_message["content"] += context
@@ -734,8 +761,10 @@ class Filter:
         __event_emitter__: Callable[[Any], Awaitable[None]]
     ) -> dict:
         # Useful properties to have around.
+        self.session_info = extract_session_info(__event_emitter__)
         self.event_emitter = __event_emitter__
         self.summarizer_model_id = self.valves.summarizer_model(body)
         await self.set_enriching_status("init")
         messages = body["messages"]

View File

@@ -1,4 +1,16 @@
-# Memory Filter
+# OpenWebUI Filters
+
+My collection of OpenWebUI Filters.
+
+So far:
+
+- **Memory Filter:** A basic narrative memory filter intended for
+  long-form storytelling/roleplaying scenarios, meant as a proof of
+  concept/springboard for more advanced narrative memory.
+- **GPU Scaling Filter:** Reduces the number of GPU layers in use if
+  Ollama crashes due to running out of VRAM.
+
+## Memory Filter
 
 Super hacky, very basic automatic narrative memory filter for
 OpenWebUI, that may or may not actually enhance narrative generation!
@@ -9,7 +21,7 @@ developments in long form story writing/roleplaying scenarios, where
 context window length is limited (or ollama crashes on long context
 length models despite having 40 GB of unused memory!).
 
-## Configuration
+### Configuration
 
 The filter exposes two settings:
@@ -30,7 +42,7 @@ The filter hooks in to OpenWebUI's RAG settings to generate embeddings
 and query the vector database. The filter will use the same embedding
 model and ChromaDB instance that's configured in the admin settings.
 
-## Usage
+### Usage
 
 Enable the filter on a model that you want to use to generate stories.
 It is recommended, although not required, that this be the same model
@@ -47,7 +59,7 @@ filter is doing.
 Do not reply while the model is updating its knowledge base or funny
 things might happen.
 
-## Functioning
+### Function
 
 What does it do?
 - When receiving user input, generate search queries for vector DB
@@ -60,7 +72,7 @@ What does it do?
 - After receiving model narrative reply, generate character and plot
   info and stick them into the vector DB.
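
The inlet/outlet flow those bullets describe follows the usual Open WebUI filter shape. A minimal illustrative skeleton, not the actual Memory Filter: `query_memories` and `store_memories` are hypothetical stand-ins for the query-generation, summarization, and ChromaDB plumbing shown in the code changes above:

```python
from typing import Any, Awaitable, Callable, Optional

class Filter:
    async def inlet(self, body: dict,
                    __event_emitter__: Callable[[Any], Awaitable[None]]) -> dict:
        # Before the model runs: search the vector DB for relevant narrative
        # memories and append them to the system message.
        messages = body["messages"]
        context: Optional[str] = await self.query_memories(messages)  # hypothetical helper
        if context and messages and messages[0].get("role") == "system":
            messages[0]["content"] += context
        return body

    async def outlet(self, body: dict,
                     __event_emitter__: Callable[[Any], Awaitable[None]]) -> dict:
        # After the model replies: summarize the new narrative turn into
        # character/plot records and write them back to the vector DB.
        await self.store_memories(body["messages"])  # hypothetical helper
        return body
```
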
-## Limitations and Known Issues
+### Limitations and Known Issues
 
 What does it not do?
 - Handle conversational branching/regeneration. In fact, this will
@@ -68,10 +80,10 @@ What does it not do?
   - Bouncing around some ideas to fix this. Basically requires
     building a "canonical" branching story path in the database?
 - Proper context "chapter" summarization (planned to change).
-- Work properly when switching conversations due to OpenWebUI
+- ~~Work properly when switching conversations due to OpenWebUI
   limitations. The chat ID is not available on incoming requests for
   some reason, so a janky workaround is used when processing LLM
-  responses.
+  responses.~~ Fixed! (but still in a very hacky way)
 - Clear out information of old conversations or expire irrelevant
   data.
@@ -84,6 +96,23 @@ Other things to do or improve:
   used, so multiple users = concurrency issues.
 - Block user input while updating the knowledgebase.
 
-## License
+## GPU Scaling Filter
+
+This is a simple filter that reduces the number of GPU layers in use
+by Ollama when it detects that Ollama has crashed (via an empty
+response coming in to OpenWebUI). Right now the logic is very basic:
+it uses static numbers to reduce GPU layer counts, and doesn't take
+into account the number of layers in models or dynamically monitor
+VRAM use.
+
+There are three settings:
+
+- **Initial Reduction:** Number of layers to immediately set when an
+  Ollama crash is detected. Defaults to 20.
+- **Scaling Step:** Number of layers to reduce by on subsequent crashes
+  (down to a minimum of 0, i.e. 100% CPU inference). Defaults to 5.
+- **Show Status:** Whether or not to inform the user that the
+  conversation is running slower due to GPU layer downscaling.
+
+# License
 
 AGPL v3.0+.
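
To make the GPU Scaling Filter settings above concrete, here is a minimal sketch of how they could be modeled as Open WebUI filter valves. It is illustrative only, not the actual filter: the valve field names, the `on_crash` hook, and the assumption that the layer count reaches Ollama through the `num_gpu` option in the request body are mine.

```python
from pydantic import BaseModel

class Filter:
    class Valves(BaseModel):
        # Defaults taken from the settings described above.
        initial_reduction: int = 20   # layers to set on the first detected crash
        scaling_step: int = 5         # layers to drop on each subsequent crash
        show_status: bool = True      # tell the user why generation got slower

    def __init__(self):
        self.valves = self.Valves()
        self.gpu_layers = None        # None = no override applied yet

    def on_crash(self, body: dict) -> dict:
        # Hypothetical hook: called when an empty response suggests Ollama crashed.
        if self.gpu_layers is None:
            self.gpu_layers = self.valves.initial_reduction
        else:
            self.gpu_layers = max(0, self.gpu_layers - self.valves.scaling_step)
        # Assumption: the GPU layer count is passed via Ollama's num_gpu option.
        body.setdefault("options", {})["num_gpu"] = self.gpu_layers
        return body
```

In the real filter the detection and reduction happen inside the inlet/outlet hooks; this sketch only isolates the scaling arithmetic implied by the settings.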