Update readme.

parent 9e38ccc33d
commit c60820b660

@@ -6,6 +6,9 @@ version: 0.1.0
 required_open_webui_version: 0.3.9
 """
+
+# Documentation: https://git.agnos.is/projectmoon/open-webui-filters
+
 # System Imports
 import chromadb
 from chromadb import ClientAPI as ChromaAPI
 from chromadb import Collection as ChromaCollection

memories.py (55 changed lines)

@@ -2,10 +2,16 @@
 title: Memory Filter
 author: projectmoon
 author_url: https://git.agnos.is/projectmoon/open-webui-filters
-version: 0.0.1
-required_open_webui_version: 0.3.8
+version: 0.0.2
+required_open_webui_version: 0.3.9
 """
+
+# Documentation: https://git.agnos.is/projectmoon/open-webui-filters
+#
+# Changelog:
+# 0.0.1 - Initial release, proof of concept
+# 0.0.2 - Slightly less hacky (but still hacky) way of getting chat IDs

 # System imports
 import asyncio
 import hashlib

@@ -429,6 +435,24 @@ class Story(BaseModel):


 # Utils
+class SessionInfo(BaseModel):
+    chat_id: str
+    message_id: str
+    session_id: str
+
+def extract_session_info(event_emitter) -> Optional[SessionInfo]:
+    """The latest innovation in hacky workarounds."""
+    try:
+        info = event_emitter.__closure__[0].cell_contents
+        return SessionInfo(
+            chat_id=info["chat_id"],
+            message_id=info["message_id"],
+            session_id=info["session_id"]
+        )
+    except:
+        return None
+
+
 def create_enrichment_summary_prompt(
     narrative: str,
     character_details: List[str],

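A small standalone sketch may help readers skimming the hunk above: it duplicates `SessionInfo` and `extract_session_info` so it can run on its own, and uses a hypothetical `make_emitter` helper as a stand-in for however OpenWebUI actually builds `__event_emitter__`. The only assumption is the one the filter itself makes: the emitter is a closure whose captured cell holds a dict with `chat_id`, `message_id`, and `session_id`.

```python
from typing import Optional
from pydantic import BaseModel


class SessionInfo(BaseModel):
    chat_id: str
    message_id: str
    session_id: str


def extract_session_info(event_emitter) -> Optional[SessionInfo]:
    """Pull request metadata out of the emitter's closure, as the filter above does."""
    try:
        info = event_emitter.__closure__[0].cell_contents
        return SessionInfo(
            chat_id=info["chat_id"],
            message_id=info["message_id"],
            session_id=info["session_id"],
        )
    except Exception:
        return None


def make_emitter(info: dict):
    """Hypothetical stand-in for OpenWebUI's emitter: a closure over the request info."""
    async def __event_emitter__(event):
        print(info["chat_id"], event)
    return __event_emitter__


emitter = make_emitter({"chat_id": "abc", "message_id": "m1", "session_id": "s1"})
print(repr(extract_session_info(emitter)))   # SessionInfo(chat_id='abc', message_id='m1', session_id='s1')
print(extract_session_info(lambda e: None))  # None: nothing captured in the closure
```
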
@@ -501,11 +525,6 @@ def create_context(results: SummarizerResponse) -> Optional[str]:
     return message


-def write_log(text):
-    with open(f"/tmp/test-memories", "a") as file:
-        file.write(text + "\n")
-
-
 def split_messages(messages, keep_amount):
     if len(messages) <= keep_amount:
         return messages[:], []

@@ -621,17 +640,18 @@ class Filter:
         __event_emitter__: Callable[[Any], Awaitable[None]],
     ) -> dict:
         # Useful things to have around.
+        self.session_info = extract_session_info(__event_emitter__)
         self.event_emitter = __event_emitter__
         self.summarizer_model_id = self.valves.summarizer_model(body)

         await self.send_outlet_status(__event_emitter__, False)
         messages = body['messages']
-        convo_id = self.extract_convo_id(messages)

         # summarize into plot points.
         summary = await self.summarize(messages)
         story = Story(
-            convo_id=convo_id, client=CHROMA_CLIENT,
+            convo_id=self.session_info.chat_id,
+            client=CHROMA_CLIENT,
             embedding_func=EMBEDDING_FUNCTION,
             messages=messages
         )

@@ -693,7 +713,10 @@
         return await summarizer.summarize()


-    async def enrich(self, story: Story, messages) -> SummarizerResponse:
+    async def enrich(self, story: Story, messages) -> Optional[SummarizerResponse]:
+        if len(messages) < 2:
+            return None
+
         await self.set_enriching_status("searching")
         query_generation_result = await self.generate_enrichment_queries(messages)
         character_results = [result

@@ -710,7 +733,8 @@

     async def update_system_message(self, messages, system_message):
         story = Story(
-            convo_id=None, client=CHROMA_CLIENT,
+            convo_id=self.session_info.chat_id,
+            client=CHROMA_CLIENT,
             embedding_func=EMBEDDING_FUNCTION,
             messages=messages
         )

@@ -720,8 +744,11 @@
         if story.convo_id == "<unset>":
             return

-        enrichment_summary: SummarizerResponse = await self.enrich(story, messages)
-        context = create_context(enrichment_summary)
+        enrichment_summary: Optional[SummarizerResponse] = await self.enrich(story, messages)
+        if enrichment_summary:
+            context = create_context(enrichment_summary)
+        else:
+            context = None

         if context:
             system_message["content"] += context

@@ -734,8 +761,10 @@
         __event_emitter__: Callable[[Any], Awaitable[None]]
     ) -> dict:
         # Useful properties to have around.
+        self.session_info = extract_session_info(__event_emitter__)
         self.event_emitter = __event_emitter__
         self.summarizer_model_id = self.valves.summarizer_model(body)
+
         await self.set_enriching_status("init")
         messages = body["messages"]

readme.md (45 changed lines)

@@ -1,4 +1,16 @@
-# Memory Filter
+# OpenWebUI Filters
+
+My collection of OpenWebUI Filters.
+
+So far:
+
+- **Memory Filter:** A basic narrative memory filter intended for
+  long-form storytelling/roleplaying scenarios. Intended as a proof
+  of concept/springboard for more advanced narrative memory.
+- **GPU Scaling Filter:** Reduce number of GPU layers in use if Ollama
+  crashes due to running out of VRAM.
+
+## Memory Filter

 Super hacky, very basic automatic narrative memory filter for
 OpenWebUI, that may or may not actually enhance narrative generation!

@@ -9,7 +21,7 @@ developments in long form story writing/roleplaying scenarios, where
 context window length is limited (or ollama crashes on long context
 length models despite having 40 GB of unused memory!).

-## Configuration
+### Configuration

 The filter exposes two settings:

@@ -30,7 +42,7 @@ The filter hooks in to OpenWebUI's RAG settings to generate embeddings
 and query the vector database. The filter will use the same embedding
 model and ChromaDB instance that's configured in the admin settings.

-## Usage
+### Usage

 Enable the filter on a model that you want to use to generate stories.
 It is recommended, although not required, that this be the same model

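To make the RAG hookup mentioned in the hunk above a bit more concrete, here is a rough, hypothetical sketch of the ChromaDB calls involved. The collection name and documents are invented for illustration, and the real filter reuses OpenWebUI's already-configured client and embedding function (the `CHROMA_CLIENT` and `EMBEDDING_FUNCTION` globals visible in memories.py) rather than constructing its own as done here.

```python
import chromadb
from chromadb.utils import embedding_functions

# Stand-ins only: the filter itself reuses OpenWebUI's configured client
# and embedding model instead of creating new ones.
client = chromadb.Client()
embedder = embedding_functions.DefaultEmbeddingFunction()

collection = client.get_or_create_collection(
    name="memory_filter_demo",  # invented name, for illustration
    embedding_function=embedder,
)

# Store a generated "plot point" and query it back, roughly the pattern
# the memory filter follows with its summaries.
collection.add(
    documents=["The heroes reached the ruined tower at dusk."],
    ids=["plot-1"],
)
results = collection.query(query_texts=["Where did the heroes go?"], n_results=1)
print(results["documents"])
```
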
@@ -47,7 +59,7 @@ filter is doing.
 Do not reply while the model is updating its knowledge base or funny
 things might happen.

-## Functioning
+### Function

 What does it do?
 - When receiving user input, generate search queries for vector DB

@@ -60,7 +72,7 @@ What does it do?
 - After receiving model narrative reply, generate character and plot
   info and stick them into the vector DB.

-## Limitations and Known Issues
+### Limitations and Known Issues

 What does it not do?
 - Handle conversational branching/regeneration. In fact, this will

@@ -68,10 +80,10 @@ What does it not do?
   - Bouncing around some ideas to fix this. Basically requires
     building a "canonical" branching story path in the database?
 - Proper context "chapter" summarization (planned to change).
-- Work properly when switching conversations due to OpenWebUI
+- ~~Work properly when switching conversations due to OpenWebUI
   limitations. The chat ID is not available on incoming requests for
   some reason, so a janky workaround is used when processing LLM
-  responses.
+  responses.~~ Fixed! (but still in a very hacky way)
 - Clear out information of old conversations or expire irrelevant
   data.

@@ -84,6 +96,23 @@ Other things to do or improve:
   used, so multiple users = concurrency issues.
 - Block user input while updating the knowledgebase.

-## License
+## GPU Scaling Filter
+
+This is a simple filter that reduces the number of GPU layers in use
+by Ollama when it detects that Ollama has crashed (via empty response
+coming in to OpenWebUI). Right now, the logic is very basic, just
+using static numbers to reduce GPU layer counts. It doesn't take into
+account the number of layers in models or dynamically monitor VRAM
+use.
+
+There are three settings:
+- **Initial Reduction:** Number of layers to immediately set when an
+  Ollama crash is detected. Defaults to 20.
+- **Scaling Step:** Number of layers to reduce by on subsequent crashes
+  (down to a minimum of 0, i.e. 100% CPU inference). Defaults to 5.
+- **Show Status:** Whether or not to inform the user that the
+  conversation is running slower due to GPU layer downscaling.
+
+# License

 AGPL v3.0+.
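
To make the GPU scaling scheme described above concrete, here is a hypothetical sketch of the arithmetic the three settings imply; the function and constant names are invented for illustration and are not taken from the filter's source.

```python
# Assumed defaults, taken from the setting descriptions above.
INITIAL_REDUCTION = 20
SCALING_STEP = 5


def next_gpu_layers(current_layers: int, crash_count: int) -> int:
    """Pick a new Ollama GPU layer count after the given number of crashes."""
    if crash_count == 0:
        return current_layers  # no crash observed, leave the model config alone
    if crash_count == 1:
        return INITIAL_REDUCTION  # first crash: drop straight to the initial reduction
    # Later crashes: keep stepping down until inference runs fully on the CPU.
    return max(0, INITIAL_REDUCTION - SCALING_STEP * (crash_count - 1))


print([next_gpu_layers(33, n) for n in range(6)])  # [33, 20, 15, 10, 5, 0]
```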