Update readme.
This commit is contained in:
parent
9e38ccc33d
commit
c60820b660
|
@ -6,6 +6,9 @@ version: 0.1.0
|
||||||
required_open_webui_version: 0.3.9
|
required_open_webui_version: 0.3.9
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Documentation: https://git.agnos.is/projectmoon/open-webui-filters
|
||||||
|
|
||||||
|
# System Imports
|
||||||
import chromadb
|
import chromadb
|
||||||
from chromadb import ClientAPI as ChromaAPI
|
from chromadb import ClientAPI as ChromaAPI
|
||||||
from chromadb import Collection as ChromaCollection
|
from chromadb import Collection as ChromaCollection
|
||||||
|
|
55
memories.py
55
memories.py
|
@ -2,10 +2,16 @@
|
||||||
title: Memory Filter
|
title: Memory Filter
|
||||||
author: projectmoon
|
author: projectmoon
|
||||||
author_url: https://git.agnos.is/projectmoon/open-webui-filters
|
author_url: https://git.agnos.is/projectmoon/open-webui-filters
|
||||||
version: 0.0.1
|
version: 0.0.2
|
||||||
required_open_webui_version: 0.3.8
|
required_open_webui_version: 0.3.9
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Documentation: https://git.agnos.is/projectmoon/open-webui-filters
|
||||||
|
#
|
||||||
|
# Changelog:
|
||||||
|
# 0.0.1 - Initial release, proof of concept
|
||||||
|
# 0.0.2 - Slightly less hacky (but still hacky) way of getting chat IDs
|
||||||
|
|
||||||
# System imports
|
# System imports
|
||||||
import asyncio
|
import asyncio
|
||||||
import hashlib
|
import hashlib
|
||||||
|
@ -429,6 +435,24 @@ class Story(BaseModel):
|
||||||
|
|
||||||
|
|
||||||
# Utils
|
# Utils
|
||||||
|
class SessionInfo(BaseModel):
    """Identifiers describing the chat request currently being filtered.

    Instances are built by extract_session_info() from the data captured in
    the event emitter's closure.
    """

    # Conversation identifier; the filter passes this as the Story's convo_id.
    chat_id: str
    # Message identifier taken from the emitter's captured data.
    message_id: str
    # Session identifier taken from the emitter's captured data.
    session_id: str
|
||||||
|
|
||||||
|
def extract_session_info(event_emitter) -> Optional[SessionInfo]:
    """Recover the chat, message and session IDs from an event emitter.

    This is a deliberate hack: the IDs are not passed to the filter
    directly, so they are read out of the first closure cell of the
    ``event_emitter`` callable, which is expected to hold a mapping with
    the keys ``chat_id``, ``message_id`` and ``session_id``.

    Args:
        event_emitter: The event emitter callable handed to the filter.

    Returns:
        A populated SessionInfo, or None when the emitter has no usable
        closure or the captured data lacks the expected keys. Callers
        must check for None before reading fields from the result.
    """
    try:
        # Relies on the emitter's implementation detail that its first
        # captured variable is the request-info mapping.
        info = event_emitter.__closure__[0].cell_contents
        return SessionInfo(
            chat_id=info["chat_id"],
            message_id=info["message_id"],
            session_id=info["session_id"],
        )
    except Exception:
        # Best-effort lookup: any ordinary failure (no closure, wrong
        # shape, missing key, validation error) means "no session info".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
|
||||||
|
|
||||||
|
|
||||||
def create_enrichment_summary_prompt(
|
def create_enrichment_summary_prompt(
|
||||||
narrative: str,
|
narrative: str,
|
||||||
character_details: List[str],
|
character_details: List[str],
|
||||||
|
@ -501,11 +525,6 @@ def create_context(results: SummarizerResponse) -> Optional[str]:
|
||||||
return message
|
return message
|
||||||
|
|
||||||
|
|
||||||
def write_log(text):
|
|
||||||
with open(f"/tmp/test-memories", "a") as file:
|
|
||||||
file.write(text + "\n")
|
|
||||||
|
|
||||||
|
|
||||||
def split_messages(messages, keep_amount):
|
def split_messages(messages, keep_amount):
|
||||||
if len(messages) <= keep_amount:
|
if len(messages) <= keep_amount:
|
||||||
return messages[:], []
|
return messages[:], []
|
||||||
|
@ -621,17 +640,18 @@ class Filter:
|
||||||
__event_emitter__: Callable[[Any], Awaitable[None]],
|
__event_emitter__: Callable[[Any], Awaitable[None]],
|
||||||
) -> dict:
|
) -> dict:
|
||||||
# Useful things to have around.
|
# Useful things to have around.
|
||||||
|
self.session_info = extract_session_info(__event_emitter__)
|
||||||
self.event_emitter = __event_emitter__
|
self.event_emitter = __event_emitter__
|
||||||
self.summarizer_model_id = self.valves.summarizer_model(body)
|
self.summarizer_model_id = self.valves.summarizer_model(body)
|
||||||
|
|
||||||
await self.send_outlet_status(__event_emitter__, False)
|
await self.send_outlet_status(__event_emitter__, False)
|
||||||
messages = body['messages']
|
messages = body['messages']
|
||||||
convo_id = self.extract_convo_id(messages)
|
|
||||||
|
|
||||||
# summarize into plot points.
|
# summarize into plot points.
|
||||||
summary = await self.summarize(messages)
|
summary = await self.summarize(messages)
|
||||||
story = Story(
|
story = Story(
|
||||||
convo_id=convo_id, client=CHROMA_CLIENT,
|
convo_id=self.session_info.chat_id,
|
||||||
|
client=CHROMA_CLIENT,
|
||||||
embedding_func=EMBEDDING_FUNCTION,
|
embedding_func=EMBEDDING_FUNCTION,
|
||||||
messages=messages
|
messages=messages
|
||||||
)
|
)
|
||||||
|
@ -693,7 +713,10 @@ class Filter:
|
||||||
return await summarizer.summarize()
|
return await summarizer.summarize()
|
||||||
|
|
||||||
|
|
||||||
async def enrich(self, story: Story, messages) -> SummarizerResponse:
|
async def enrich(self, story: Story, messages) -> Optional[SummarizerResponse]:
|
||||||
|
if len(messages) < 2:
|
||||||
|
return None
|
||||||
|
|
||||||
await self.set_enriching_status("searching")
|
await self.set_enriching_status("searching")
|
||||||
query_generation_result = await self.generate_enrichment_queries(messages)
|
query_generation_result = await self.generate_enrichment_queries(messages)
|
||||||
character_results = [result
|
character_results = [result
|
||||||
|
@ -710,7 +733,8 @@ class Filter:
|
||||||
|
|
||||||
async def update_system_message(self, messages, system_message):
|
async def update_system_message(self, messages, system_message):
|
||||||
story = Story(
|
story = Story(
|
||||||
convo_id=None, client=CHROMA_CLIENT,
|
convo_id=self.session_info.chat_id,
|
||||||
|
client=CHROMA_CLIENT,
|
||||||
embedding_func=EMBEDDING_FUNCTION,
|
embedding_func=EMBEDDING_FUNCTION,
|
||||||
messages=messages
|
messages=messages
|
||||||
)
|
)
|
||||||
|
@ -720,8 +744,11 @@ class Filter:
|
||||||
if story.convo_id == "<unset>":
|
if story.convo_id == "<unset>":
|
||||||
return
|
return
|
||||||
|
|
||||||
enrichment_summary: SummarizerResponse = await self.enrich(story, messages)
|
enrichment_summary: Optional[SummarizerResponse] = await self.enrich(story, messages)
|
||||||
context = create_context(enrichment_summary)
|
if enrichment_summary:
|
||||||
|
context = create_context(enrichment_summary)
|
||||||
|
else:
|
||||||
|
context = None
|
||||||
|
|
||||||
if context:
|
if context:
|
||||||
system_message["content"] += context
|
system_message["content"] += context
|
||||||
|
@ -734,8 +761,10 @@ class Filter:
|
||||||
__event_emitter__: Callable[[Any], Awaitable[None]]
|
__event_emitter__: Callable[[Any], Awaitable[None]]
|
||||||
) -> dict:
|
) -> dict:
|
||||||
# Useful properties to have around.
|
# Useful properties to have around.
|
||||||
|
self.session_info = extract_session_info(__event_emitter__)
|
||||||
self.event_emitter = __event_emitter__
|
self.event_emitter = __event_emitter__
|
||||||
self.summarizer_model_id = self.valves.summarizer_model(body)
|
self.summarizer_model_id = self.valves.summarizer_model(body)
|
||||||
|
|
||||||
await self.set_enriching_status("init")
|
await self.set_enriching_status("init")
|
||||||
messages = body["messages"]
|
messages = body["messages"]
|
||||||
|
|
||||||
|
|
45
readme.md
45
readme.md
|
@ -1,4 +1,16 @@
|
||||||
# Memory Filter
|
# OpenWebUI Filters
|
||||||
|
|
||||||
|
My collection of OpenWebUI Filters.
|
||||||
|
|
||||||
|
So far:
|
||||||
|
|
||||||
|
- **Memory Filter:** A basic narrative memory filter intended for
|
||||||
|
long-form storytelling/roleplaying scenarios. Intended as a proof
|
||||||
|
of concept/springboard for more advanced narrative memory.
|
||||||
|
- **GPU Scaling Filter:** Reduce number of GPU layers in use if Ollama
|
||||||
|
crashes due to running out of VRAM.
|
||||||
|
|
||||||
|
## Memory Filter
|
||||||
|
|
||||||
Super hacky, very basic automatic narrative memory filter for
|
Super hacky, very basic automatic narrative memory filter for
|
||||||
OpenWebUI, that may or may not actually enhance narrative generation!
|
OpenWebUI, that may or may not actually enhance narrative generation!
|
||||||
|
@ -9,7 +21,7 @@ developments in long form story writing/roleplaying scenarios, where
|
||||||
context window length is limited (or ollama crashes on long context
|
context window length is limited (or ollama crashes on long context
|
||||||
length models despite having 40 GB of unused memory!).
|
length models despite having 40 GB of unused memory!).
|
||||||
|
|
||||||
## Configuration
|
### Configuration
|
||||||
|
|
||||||
The filter exposes two settings:
|
The filter exposes two settings:
|
||||||
|
|
||||||
|
@ -30,7 +42,7 @@ The filter hooks in to OpenWebUI's RAG settings to generate embeddings
|
||||||
and query the vector database. The filter will use the same embedding
|
and query the vector database. The filter will use the same embedding
|
||||||
model and ChromaDB instance that's configured in the admin settings.
|
model and ChromaDB instance that's configured in the admin settings.
|
||||||
|
|
||||||
## Usage
|
### Usage
|
||||||
|
|
||||||
Enable the filter on a model that you want to use to generate stories.
|
Enable the filter on a model that you want to use to generate stories.
|
||||||
It is recommended, although not required, that this be the same model
|
It is recommended, although not required, that this be the same model
|
||||||
|
@ -47,7 +59,7 @@ filter is doing.
|
||||||
Do not reply while the model is updating its knowledge base or funny
|
Do not reply while the model is updating its knowledge base or funny
|
||||||
things might happen.
|
things might happen.
|
||||||
|
|
||||||
## Functioning
|
### Function
|
||||||
|
|
||||||
What does it do?
|
What does it do?
|
||||||
- When receiving user input, generate search queries for vector DB
|
- When receiving user input, generate search queries for vector DB
|
||||||
|
@ -60,7 +72,7 @@ What does it do?
|
||||||
- After receiving model narrative reply, generate character and plot
|
- After receiving model narrative reply, generate character and plot
|
||||||
info and stick them into the vector DB.
|
info and stick them into the vector DB.
|
||||||
|
|
||||||
## Limitations and Known Issues
|
### Limitations and Known Issues
|
||||||
|
|
||||||
What does it not do?
|
What does it not do?
|
||||||
- Handle conversational branching/regeneration. In fact, this will
|
- Handle conversational branching/regeneration. In fact, this will
|
||||||
|
@ -68,10 +80,10 @@ What does it not do?
|
||||||
- Bouncing around some ideas to fix this. Basically requires
|
- Bouncing around some ideas to fix this. Basically requires
|
||||||
building a "canonical" branching story path in the database?
|
building a "canonical" branching story path in the database?
|
||||||
- Proper context "chapter" summarization (planned to change).
|
- Proper context "chapter" summarization (planned to change).
|
||||||
- Work properly when switching conversations due to OpenWebUI
|
- ~~Work properly when switching conversations due to OpenWebUI
|
||||||
limitations. The chat ID is not available on incoming requests for
|
limitations. The chat ID is not available on incoming requests for
|
||||||
some reason, so a janky workaround is used when processing LLM
|
some reason, so a janky workaround is used when processing LLM
|
||||||
responses.
|
responses.~~ Fixed! (but still in a very hacky way)
|
||||||
- Clear out information of old conversations or expire irrelevant
|
- Clear out information of old conversations or expire irrelevant
|
||||||
data.
|
data.
|
||||||
|
|
||||||
|
@ -84,6 +96,23 @@ Other things to do or improve:
|
||||||
used, so multiple users = concurrency issues.
|
used, so multiple users = concurrency issues.
|
||||||
- Block user input while updating the knowledge base.
|
- Block user input while updating the knowledge base.
|
||||||
|
|
||||||
## License
|
## GPU Scaling Filter
|
||||||
|
|
||||||
|
This is a simple filter that reduces the number of GPU layers in use
|
||||||
|
by Ollama when it detects that Ollama has crashed (via an empty response
|
||||||
|
coming in to OpenWebUI). Right now, the logic is very basic, just
|
||||||
|
using static numbers to reduce GPU layer counts. It doesn't take into
|
||||||
|
account the number of layers in models or dynamically monitor VRAM
|
||||||
|
use.
|
||||||
|
|
||||||
|
There are three settings:
|
||||||
|
- **Initial Reduction:** Number of layers to immediately set when an
|
||||||
|
Ollama crash is detected. Defaults to 20.
|
||||||
|
- **Scaling Step:** Number of layers to reduce by on subsequent crashes
|
||||||
|
(down to a minimum of 0, i.e. 100% CPU inference). Defaults to 5.
|
||||||
|
- **Show Status:** Whether or not to inform the user that the
|
||||||
|
conversation is running slower due to GPU layer downscaling.
|
||||||
|
|
||||||
|
# License
|
||||||
|
|
||||||
AGPL v3.0+.
|
AGPL v3.0+.
|
||||||
|
|
Loading…
Reference in New Issue