From c60820b660abc37ad8c8b17cffc3b3a33eb7f1a1 Mon Sep 17 00:00:00 2001
From: projectmoon
Date: Thu, 18 Jul 2024 22:13:00 +0200
Subject: [PATCH] Update readme.

---
 gpu_layer_scaler.py |  3 +++
 memories.py         | 55 ++++++++++++++++++++++++++++++++++-----------
 readme.md           | 45 ++++++++++++++++++++++++++++++-------
 3 files changed, 82 insertions(+), 21 deletions(-)

diff --git a/gpu_layer_scaler.py b/gpu_layer_scaler.py
index 359ea15..e91e21e 100644
--- a/gpu_layer_scaler.py
+++ b/gpu_layer_scaler.py
@@ -6,6 +6,9 @@ version: 0.1.0
 required_open_webui_version: 0.3.9
 """

+# Documentation: https://git.agnos.is/projectmoon/open-webui-filters
+
+# System Imports
 import chromadb
 from chromadb import ClientAPI as ChromaAPI
 from chromadb import Collection as ChromaCollection
diff --git a/memories.py b/memories.py
index 3936466..71d30e4 100644
--- a/memories.py
+++ b/memories.py
@@ -2,10 +2,16 @@ title: Memory Filter
 author: projectmoon
 author_url: https://git.agnos.is/projectmoon/open-webui-filters
-version: 0.0.1
-required_open_webui_version: 0.3.8
+version: 0.0.2
+required_open_webui_version: 0.3.9
 """

+# Documentation: https://git.agnos.is/projectmoon/open-webui-filters
+#
+# Changelog:
+# 0.0.1 - Initial release, proof of concept
+# 0.0.2 - Slightly less hacky (but still hacky) way of getting chat IDs
+
 # System imports
 import asyncio
 import hashlib
@@ -429,6 +435,24 @@ class Story(BaseModel):

 # Utils

+class SessionInfo(BaseModel):
+    chat_id: str
+    message_id: str
+    session_id: str
+
+def extract_session_info(event_emitter) -> Optional[SessionInfo]:
+    """The latest innovation in hacky workarounds."""
+    try:
+        info = event_emitter.__closure__[0].cell_contents
+        return SessionInfo(
+            chat_id=info["chat_id"],
+            message_id=info["message_id"],
+            session_id=info["session_id"]
+        )
+    except:
+        return None
+
+
 def create_enrichment_summary_prompt(
     narrative: str,
     character_details: List[str],
@@ -501,11 +525,6 @@ def create_context(results: SummarizerResponse) -> Optional[str]:

     return message

-def write_log(text):
-    with open(f"/tmp/test-memories", "a") as file:
-        file.write(text + "\n")
-
-
 def split_messages(messages, keep_amount):
     if len(messages) <= keep_amount:
         return messages[:], []
@@ -621,17 +640,18 @@ class Filter:
         __event_emitter__: Callable[[Any], Awaitable[None]],
     ) -> dict:
         # Useful things to have around.
+        self.session_info = extract_session_info(__event_emitter__)
         self.event_emitter = __event_emitter__
         self.summarizer_model_id = self.valves.summarizer_model(body)
         await self.send_outlet_status(__event_emitter__, False)

         messages = body['messages']
-        convo_id = self.extract_convo_id(messages)

         # summarize into plot points.
         summary = await self.summarize(messages)
         story = Story(
-            convo_id=convo_id, client=CHROMA_CLIENT,
+            convo_id=self.session_info.chat_id,
+            client=CHROMA_CLIENT,
             embedding_func=EMBEDDING_FUNCTION,
             messages=messages
         )
@@ -693,7 +713,10 @@ class Filter:

         return await summarizer.summarize()

-    async def enrich(self, story: Story, messages) -> SummarizerResponse:
+    async def enrich(self, story: Story, messages) -> Optional[SummarizerResponse]:
+        if len(messages) < 2:
+            return None
+
         await self.set_enriching_status("searching")
         query_generation_result = await self.generate_enrichment_queries(messages)
         character_results = [result
@@ -710,7 +733,8 @@ class Filter:

     async def update_system_message(self, messages, system_message):
         story = Story(
-            convo_id=None, client=CHROMA_CLIENT,
+            convo_id=self.session_info.chat_id,
+            client=CHROMA_CLIENT,
             embedding_func=EMBEDDING_FUNCTION,
             messages=messages
         )
@@ -720,8 +744,11 @@ class Filter:
         if story.convo_id == "":
             return

-        enrichment_summary: SummarizerResponse = await self.enrich(story, messages)
-        context = create_context(enrichment_summary)
+        enrichment_summary: Optional[SummarizerResponse] = await self.enrich(story, messages)
+        if enrichment_summary:
+            context = create_context(enrichment_summary)
+        else:
+            context = None

         if context:
             system_message["content"] += context
@@ -734,8 +761,10 @@ class Filter:
         __event_emitter__: Callable[[Any], Awaitable[None]]
     ) -> dict:
         # Useful properties to have around.
+        self.session_info = extract_session_info(__event_emitter__)
         self.event_emitter = __event_emitter__
         self.summarizer_model_id = self.valves.summarizer_model(body)
+
         await self.set_enriching_status("init")

         messages = body["messages"]
diff --git a/readme.md b/readme.md
index 8990589..f99a93e 100644
--- a/readme.md
+++ b/readme.md
@@ -1,4 +1,16 @@
-# Memory Filter
+# OpenWebUI Filters
+
+My collection of OpenWebUI Filters.
+
+So far:
+
+ - **Memory Filter:** A basic narrative memory filter intended for
+   long-form storytelling/roleplaying scenarios. Intended as a proof
+   of concept/springboard for more advanced narrative memory.
+ - **GPU Scaling Filter:** Reduce number of GPU layers in use if Ollama
+   crashes due to running out of VRAM.
+
+## Memory Filter

 Super hacky, very basic automatic narrative memory filter for
 OpenWebUI, that may or may not actually enhance narrative generation!
@@ -9,7 +21,7 @@ developments in long form story writing/roleplaying scenarios, where
 context window length is limited (or ollama crashes on long context
 length models despite having 40 GB of unused memory!).

-## Configuration
+### Configuration

 The filter exposes two settings:

@@ -30,7 +42,7 @@ The filter hooks in to OpenWebUI's RAG settings to generate embeddings
 and query the vector database. The filter will use the same embedding
 model and ChromaDB instance that's configured in the admin settings.

-## Usage
+### Usage

 Enable the filter on a model that you want to use to generate
 stories. It is recommended, although not required, that this be the same model
@@ -47,7 +59,7 @@ filter is doing.
 Do not reply while the model is updating its knowledge base or funny
 things might happen.

-## Functioning
+### Function

 What does it do?
  - When receiving user input, generate search queries for vector DB
@@ -60,7 +72,7 @@ What does it do?
  - After receiving model narrative reply, generate character and plot
   info and stick them into the vector DB.

-## Limitations and Known Issues
+### Limitations and Known Issues

 What does it not do?
  - Handle conversational branching/regeneration. In fact, this will
@@ -68,10 +80,10 @@ What does it not do?
    - Bouncing around some ideas to fix this. Basically requires
     building a "canonical" branching story path in the database?
  - Proper context "chapter" summarization (planned to change).
- - Work properly when switching conversations due to OpenWebUI
+ - ~~Work properly when switching conversations due to OpenWebUI
   limitations. The chat ID is not available on incoming requests for
   some reason, so a janky workaround is used when processing LLM
-  responses.
+  responses.~~ Fixed! (but still in a very hacky way)
  - Clear out information of old conversations or expire irrelevant
   data.

@@ -84,6 +96,23 @@ Other things to do or improve:
   used, so multiple users = concurrency issues.
  - Block user input while updating the knowledgebase.

-## License
+## GPU Scaling Filter
+
+This is a simple filter that reduces the number of GPU layers in use
+by Ollama when it detects that Ollama has crashed (via empty response
+coming in to OpenWebUI). Right now, the logic is very basic, just
+using static numbers to reduce GPU layer counts. It doesn't take into
+account the number of layers in models or dynamically monitor VRAM
+use.
+
+There are three settings:
+ - **Initial Reduction:** Number of layers to immediately set when an
+   Ollama crash is detected. Defaults to 20.
+ - **Scaling Step:** Number of layers to reduce by on subsequent crashes
+   (down to a minimum of 0, i.e. 100% CPU inference). Defaults to 5.
+ - **Show Status:** Whether or not to inform the user that the
+   conversation is running slower due to GPU layer downscaling.
+
+# License

 AGPL v3.0+.
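
Editor's sketch (not part of the patch): the readme text added above describes the GPU Scaling Filter's downscaling rule as a static initial reduction on the first crash, then fixed-size steps down to zero layers. A minimal Python sketch of that arithmetic follows, assuming hypothetical names (`next_gpu_layers`, `current_layers`) that are illustrative only and are not taken from `gpu_layer_scaler.py`.

```python
# Minimal sketch of the layer-downscaling rule described in the readme.
# Names here (next_gpu_layers, current_layers) are illustrative, not the
# filter's actual API.
from typing import Optional

def next_gpu_layers(
    current_layers: Optional[int],
    initial_reduction: int = 20,   # layers to set on the first detected crash
    scaling_step: int = 5,         # layers to drop on each subsequent crash
) -> int:
    """Return the GPU layer count to request after an Ollama crash."""
    if current_layers is None:
        # First crash: jump straight to the static initial reduction.
        return initial_reduction
    # Later crashes: step down, bottoming out at 0 (100% CPU inference).
    return max(0, current_layers - scaling_step)

# Example progression across repeated crashes: 20 -> 15 -> 10 -> 5 -> 0 -> 0
layers: Optional[int] = None
for _ in range(6):
    layers = next_gpu_layers(layers)
    print(layers)
```

The defaults correspond to the Initial Reduction and Scaling Step valves described in the readme; detecting the crash (the empty response mentioned above) and carrying the reduced layer count forward between requests are left out of this sketch.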