GPU layer scaling filter

projectmoon committed 2024-07-18 21:49:35 +02:00
parent ec5f748964
commit 9e38ccc33d
1 changed file with 174 additions and 50 deletions


@@ -1,38 +1,76 @@
 """
-title: GPU scaling router
-author: open-webui, atgehrhardt
-author_url: https://github.com/open-webui
-funding_url: https://github.com/open-webui
-version: 0.1.4
-required_open_webui_version: 0.3.8
+title: GPU Scaling Filter
+author: projectmoon
+author_url: https://git.agnos.is/projectmoon/open-webui-filters
+version: 0.1.0
+required_open_webui_version: 0.3.9
 """
 
+import chromadb
+from chromadb import ClientAPI as ChromaAPI
+from chromadb import Collection as ChromaCollection
 from pydantic import BaseModel, Field
 from typing import Callable, Awaitable, Any, Optional, Literal
 import json
 
+# OpenWebUI imports
+from config import CHROMA_CLIENT
 from utils.misc import get_last_user_message, get_last_assistant_message
 from apps.ollama.main import generate_chat_completion, GenerateChatCompletionForm
 from apps.webui.models.users import UserModel
 
-# To get ROCm VRAM use: rocm-smi --showmeminfo vram --json
-# To figure out GPU layers in use: janky ass bullshit!
-# 1. Use ollama API to get modelfile from model info.
-# 2. Pull actual file path of model out of the modelfile.
-# 3. Scan running processes for the one that is using our file.
-# 4. Parse its command line to get number of GPU layers.
-
-# How to stabilize VRAM use: we don't want to change layers all the
-# time, because it'll cause the model to reload a lot.
-# We need to maintain state per convo (yay). Shove it into ChromaDB!
-# Could also try summing up tokens? Or calculating vram use of model
-# vs vram use of rocm, and do nothing if below %
+class GpuChatState:
+    """
+    Get or set GPU layer count by base model for a given chat.
+    """
+
+    collection_name = "gpu_layers_by_chat"
+
+    def __init__(self, chroma_client: ChromaAPI, chat_id: str):
+        self.chroma_client = chroma_client
+        self.chat_id = chat_id
+        self.gpu_layers = {}
+
+    def _get_collection(self) -> ChromaCollection:
+        return self.chroma_client.get_or_create_collection(
+            name=GpuChatState.collection_name
+        )
+
+    def _parse_results(self, results) -> dict:
+        if 'documents' in results:
+            doc = results['documents'][0] if len(results['documents']) > 0 else None
+            return json.loads(doc) if doc else {}
+        else:
+            return {}
+
+    def get_gpu_layers(self):
+        coll = self._get_collection()
+        if self.gpu_layers == {}:
+            self.gpu_layers = self._parse_results(
+                coll.get(ids=[self.chat_id], include=["documents"])
+            )
+        return self.gpu_layers
+
+    def get_gpu_layers_for_model(self, model_id: str) -> Optional[int]:
+        info = self.get_gpu_layers()
+        return info[model_id] if model_id in info else None
+
+    def set_gpu_layers(self, model: str, amount: int):
+        # set gpu layers for this chat.
+        self.gpu_layers[model] = amount
+        self._get_collection().upsert(
+            ids=[self.chat_id],
+            documents=[json.dumps(self.gpu_layers)]
+        )
+        self.gpu_layers = self.get_gpu_layers()
 
-def write_log(text):
-    with open(f"/tmp/test-memories", "a") as file:
-        file.write(text + "\n")
+class SessionInfo(BaseModel):
+    chat_id: str
+    message_id: str
+    session_id: str
 
 def dict_to_attributes(input_dict):
     class AttrDict:
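The GpuChatState added above keeps one ChromaDB document per chat, holding a JSON map of base model id to GPU layer count. Below is a minimal sketch of that storage round trip, run against a throwaway in-memory ChromaDB client instead of Open WebUI's CHROMA_CLIENT; the chat id and model id are invented for illustration.

import json
import chromadb

client = chromadb.EphemeralClient()  # stand-in for Open WebUI's CHROMA_CLIENT
coll = client.get_or_create_collection(name="gpu_layers_by_chat")

chat_id = "example-chat-id"   # hypothetical chat id
layers = {"llama3:8b": 20}    # hypothetical base model id -> GPU layer count

# Persist the mapping for this chat (mirrors GpuChatState.set_gpu_layers).
coll.upsert(ids=[chat_id], documents=[json.dumps(layers)])

# Read it back (mirrors GpuChatState.get_gpu_layers / _parse_results).
results = coll.get(ids=[chat_id], include=["documents"])
doc = results["documents"][0] if results["documents"] else None
print(json.loads(doc) if doc else {})  # {'llama3:8b': 20}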
@@ -42,18 +80,35 @@ def dict_to_attributes(input_dict):
     return AttrDict(input_dict)
 
-def convert_user(user):
-    user['info'] = {}
-    return dict_to_attributes(user)
+def extract_model_id(model: dict) -> Optional[str]:
+    if "info" in model:
+        model_info = model["info"]
+        return model_info["base_model_id"] if "base_model_id" in model_info else model["id"]
+    else:
+        return None
+
+def extract_session_info(event_emitter) -> Optional[SessionInfo]:
+    """The latest innovation in hacky workarounds."""
+    try:
+        info = event_emitter.__closure__[0].cell_contents
+        return SessionInfo(
+            chat_id=info["chat_id"],
+            message_id=info["message_id"],
+            session_id=info["session_id"]
+        )
+    except:
+        return None
 
 class Filter:
     class Valves(BaseModel):
-        scaling_start: int = Field(
-            default=90,
-            description="VRAM usage percent to start scaling back GPU layers",
-        )
+        reduction_start: int = Field(
+            default=20, description="Amount of GPU layers to reduce to immediately on failure"
+        )
         scaling_step: int = Field(
-            default=3, description="Amount of GPU layers to reduce"
+            default=5, description="Amount of GPU layers to reduce by on continued failures"
+        )
+        show_status: bool = Field(
+            default=True, description="Show status message when running downscaled model."
         )
         pass
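extract_session_info above digs the chat, message, and session ids out of the first closure cell of the event emitter that Open WebUI passes to the filter. The snippet below is a toy reproduction of that trick; make_emitter and the ids are invented stand-ins, not Open WebUI code.

def make_emitter(info: dict):
    # The real emitter is built by Open WebUI as a closure over request info;
    # this factory only imitates that shape.
    async def emitter(event):
        print(info["session_id"], event)
    return emitter

emitter = make_emitter({
    "chat_id": "chat-123",
    "message_id": "msg-456",
    "session_id": "sess-789",
})

# Same access pattern as extract_session_info:
info = emitter.__closure__[0].cell_contents
print(info["chat_id"], info["message_id"], info["session_id"])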
@@ -61,27 +116,79 @@ class Filter:
         self.valves = self.Valves()
         pass
 
-    async def message_adjusting(self, done: bool):
+    async def send_message_adjusting(self, done: bool, amount: int=0, steps: int=0):
+        if steps > 0:
+            steps_desc = f"reduced by {steps}"
+        else:
+            steps_desc = "initial reduction"
+        desc = (
+            "Downscaling GPU layers..." if not done
+            else f"GPU layers downscaled to {amount} ({steps_desc}). Please retry.")
         await self.event_emitter(
             {
                 "type": "status",
                 "data": {
-                    "description": "Adjusting GPU layers",
-                    "done": done,
+                    "description": desc,
+                    "done": done
                 },
             }
         )
 
-    async def retry_message(self, body, user):
-        request = GenerateChatCompletionForm(
-            model=body["model"],
-            messages=body["messages"],
-            stream=False,
-            keep_alive="10s",
-            options={"num_gpu": 1},
-        )
-        return await generate_chat_completion(request, user=user)
+    async def send_message_downscaled(self):
+        await self.event_emitter(
+            {
+                "type": "status",
+                "data": {
+                    "description": "Running at reduced GPU capacity. Responses will be slower.",
+                    "done": True
+                },
+            }
+        )
+
+    def get_num_layers_for_model(
+        self,
+        gpu_layer_info: GpuChatState,
+        __model__: dict
+    ) -> Optional[int]:
+        model_id = extract_model_id(__model__)
+        if model_id:
+            return gpu_layer_info.get_gpu_layers_for_model(model_id)
+        else:
+            return None
+
+    async def downscale(self, model):
+        """Update tracked downscaled GPU layers for this chat + model."""
+        # This logic is currently very basic and does not yet take into
+        # account the actual number of layers in a model, but it's better
+        # than nothing. If this is the first failure (no entry in GPU chat
+        # state), set the number of layers to the reduction_start valve.
+        # If this is a subsequent failure (an entry for this chat already
+        # exists), reduce by the scaling_step valve, down to a minimum of
+        # 0 (100% CPU).
+        await self.send_message_adjusting(False)
+        gpu_layer_info = GpuChatState(CHROMA_CLIENT, self.session_info.chat_id)
+        num_layers = self.get_num_layers_for_model(gpu_layer_info, model)
+        print(f"num layers is {num_layers}")
+        downscale_steps = 0
+        if num_layers:
+            print(f"Downscaling layers by {self.valves.scaling_step}")
+            num_layers -= self.valves.scaling_step
+            downscale_steps = self.valves.scaling_step
+            if num_layers < 0:
+                num_layers = 0
+        else:
+            num_layers = self.valves.reduction_start
+        model_id = extract_model_id(model)
+        if model_id:
+            gpu_layer_info.set_gpu_layers(model_id, num_layers)
+            await self.send_message_adjusting(True, amount=num_layers, steps=downscale_steps)
+            print(
+                f"Set GPU layers for chat {self.session_info.chat_id} to {num_layers}"
+            )
 
     async def inlet(
         self,
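With the default valves, the downscale policy above jumps to 20 layers on the first failure and then steps down by 5 on each further failure, clamping at 0. A stand-alone restatement of that arithmetic as a pure function (not part of the filter itself):

from typing import Optional

def next_layer_count(current: Optional[int],
                     reduction_start: int = 20,
                     scaling_step: int = 5) -> int:
    # Mirrors the truthiness check in downscale(): no stored entry (None)
    # and a stored value of 0 both fall back to reduction_start.
    if not current:
        return reduction_start
    return max(current - scaling_step, 0)

print(next_layer_count(None))  # 20  (first failure)
print(next_layer_count(20))    # 15  (second failure)
print(next_layer_count(3))     # 0   (fully on CPU)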
@@ -89,7 +196,21 @@ class Filter:
         __event_emitter__: Callable[[Any], Awaitable[None]],
         __model__: Optional[dict] = None,
     ) -> dict:
+        """Intercept incoming messages and downscale if necessary."""
         self.event_emitter = __event_emitter__
+        self.session_info = extract_session_info(__event_emitter__)
+
+        if self.session_info and __model__:
+            model_id = extract_model_id(__model__)
+            gpu_layer_info = GpuChatState(CHROMA_CLIENT, self.session_info.chat_id)
+            num_layers = self.get_num_layers_for_model(gpu_layer_info, __model__)
+
+            if num_layers and "options" in body:
+                body["options"]["num_gpu"] = num_layers
+                if self.valves.show_status:
+                    await self.send_message_downscaled()
+                print(f"Downscaled GPU layers for incoming request for {model_id} to {num_layers}")
+
         return body
 
     async def outlet(
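The inlet above only touches body["options"]["num_gpu"], which Ollama interprets as the number of layers to offload to the GPU. Roughly what that mutation looks like on a request body; the payload below is illustrative, not a verbatim Open WebUI body.

body = {
    "model": "llama3:8b",  # hypothetical model id
    "messages": [{"role": "user", "content": "hello"}],
    "options": {"temperature": 0.7},
}

num_layers = 15  # value previously stored by downscale() for this chat

if num_layers and "options" in body:
    body["options"]["num_gpu"] = num_layers

print(body["options"])  # {'temperature': 0.7, 'num_gpu': 15}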
@@ -99,23 +220,26 @@ class Filter:
         __event_emitter__: Callable[[Any], Awaitable[None]],
         __model__: Optional[dict] = None,
     ) -> dict:
-        user = convert_user(__user__)
+        """On response failure, downscale the GPU layers for next try."""
         self.event_emitter = __event_emitter__
+        self.session_info = extract_session_info(__event_emitter__)
+
+        if not self.session_info or not __model__:
+            return body
 
         if len(body["messages"]) == 0:
             return body
 
-        message = body["messages"][-1]
-        write_log("got a message")
-        write_log(f"message: {str(message)}")
-        broke = message["content"] == "" and message["info"] == {}
+        last_reply = body["messages"][-1]
+        broke = last_reply["content"] == "" and last_reply["info"] == {}
 
         if broke:
-            # at this point, we COULD set status and attempt to reduce
-            # the GPU layers?
-            await self.message_adjusting(False)
-            del body["messages"][-1]
-            retried = await self.retry_message(body, user)
-            await self.message_adjusting(True)
-            message["content"] = get_last_assistant_message(retried)
+            # While we could actually redo the message itself, it is
+            # useless, because Open WebUI does not currently have a way
+            # to clear the error state when message content is replaced.
+            # So we just lower the GPU layers and tell the user to try
+            # again; the inlet will intercept the incoming request and
+            # lower the GPU layers.
+            await self.downscale(__model__)
 
         return body
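The outlet treats an assistant reply with empty content and an empty info dict as a failed generation and triggers the downscale for the next attempt. A minimal illustration of that check, with invented message dicts:

def is_broken(reply: dict) -> bool:
    # Same condition the outlet applies to the last message in the chat.
    return reply["content"] == "" and reply["info"] == {}

ok_reply = {"role": "assistant", "content": "Hi there!", "info": {"eval_count": 42}}
failed_reply = {"role": "assistant", "content": "", "info": {}}

print(is_broken(ok_reply))      # False
print(is_broken(failed_reply))  # True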