GPU layer scaling filter

projectmoon committed 2024-07-18 21:49:35 +02:00
parent ec5f748964
commit 9e38ccc33d
1 changed file with 174 additions and 50 deletions


@@ -1,38 +1,76 @@
 """
-title: GPU scaling router
-author: open-webui, atgehrhardt
-author_url: https://github.com/open-webui
-funding_url: https://github.com/open-webui
-version: 0.1.4
-required_open_webui_version: 0.3.8
+title: GPU Scaling Filter
+author: projectmoon
+author_url: https://git.agnos.is/projectmoon/open-webui-filters
+version: 0.1.0
+required_open_webui_version: 0.3.9
 """
 
+import chromadb
+from chromadb import ClientAPI as ChromaAPI
+from chromadb import Collection as ChromaCollection
 from pydantic import BaseModel, Field
 from typing import Callable, Awaitable, Any, Optional, Literal
 import json
 
+# OpenWebUI imports
+from config import CHROMA_CLIENT
 from utils.misc import get_last_user_message, get_last_assistant_message
 from apps.ollama.main import generate_chat_completion, GenerateChatCompletionForm
 from apps.webui.models.users import UserModel
 
-# To get ROCm VRAM use: rocm-smi --showmeminfo vram --json
-# To figure out GPU layers in use: janky ass bullshit!
-# 1. Use ollama API to get modelfile from model info.
-# 2. Pull actual file path of model out of the modelfile.
-# 3. Scan running processes for the one that is using our file.
-# 4. Parse its command line to get number of GPU layers.
-
-# How to stabilize VRAM use: we don't want to change layers all the
-# time, because it'll cause the model to reload a lot.
-# We need to maintain state per convo (yay). Shove it into ChromaDB!
-# Could also try summing up tokens? Or calculating vram use of model
-# vs vram use of rocm, and do nothing if below %
+class GpuChatState:
+    """
+    Get or set GPU layer count by base model for a given chat.
+    """
+
+    collection_name = "gpu_layers_by_chat"
+
+    def __init__(self, chroma_client: ChromaAPI, chat_id: str):
+        self.chroma_client = chroma_client
+        self.chat_id = chat_id
+        self.gpu_layers = {}
+
+    def _get_collection(self) -> ChromaCollection:
+        return self.chroma_client.get_or_create_collection(
+            name=GpuChatState.collection_name
+        )
+
+    def _parse_results(self, results) -> dict:
+        if 'documents' in results:
+            doc = results['documents'][0] if len(results['documents']) > 0 else None
+            return json.loads(doc) if doc else {}
+        else:
+            return {}
+
+    def get_gpu_layers(self):
+        coll = self._get_collection()
+        if self.gpu_layers == {}:
+            self.gpu_layers = self._parse_results(
+                coll.get(ids=[self.chat_id], include=["documents"])
+            )
+        return self.gpu_layers
+
+    def get_gpu_layers_for_model(self, model_id: str) -> Optional[int]:
+        info = self.get_gpu_layers()
+        return info[model_id] if model_id in info else None
+
+    def set_gpu_layers(self, model: str, amount: int):
+        # set gpu layers for this chat.
+        self.gpu_layers[model] = amount
+        self._get_collection().upsert(
+            ids=[self.chat_id],
+            documents=[json.dumps(self.gpu_layers)]
+        )
+        self.gpu_layers = self.get_gpu_layers()
 
-def write_log(text):
-    with open(f"/tmp/test-memories", "a") as file:
-        file.write(text + "\n")
+class SessionInfo(BaseModel):
+    chat_id: str
+    message_id: str
+    session_id: str
 
 def dict_to_attributes(input_dict):
     class AttrDict:
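The GpuChatState added above keeps one ChromaDB document per chat, holding a JSON map of base model id to GPU layer count. Below is a minimal sketch of that storage round trip, run against a throwaway in-memory ChromaDB client instead of Open WebUI's CHROMA_CLIENT; the chat id and model id are invented for illustration.

import json
import chromadb

client = chromadb.EphemeralClient()  # stand-in for Open WebUI's CHROMA_CLIENT
coll = client.get_or_create_collection(name="gpu_layers_by_chat")

chat_id = "example-chat-id"   # hypothetical chat id
layers = {"llama3:8b": 20}    # hypothetical base model id -> GPU layer count

# Persist the mapping for this chat (mirrors GpuChatState.set_gpu_layers).
coll.upsert(ids=[chat_id], documents=[json.dumps(layers)])

# Read it back (mirrors GpuChatState.get_gpu_layers / _parse_results).
results = coll.get(ids=[chat_id], include=["documents"])
doc = results["documents"][0] if results["documents"] else None
print(json.loads(doc) if doc else {})  # {'llama3:8b': 20}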
@@ -42,18 +80,35 @@ def dict_to_attributes(input_dict):
     return AttrDict(input_dict)
 
-def convert_user(user):
-    user['info'] = {}
-    return dict_to_attributes(user)
+def extract_model_id(model: dict) -> Optional[str]:
+    if "info" in model:
+        model_info = model["info"]
+        return model_info["base_model_id"] if "base_model_id" in model_info else model["id"]
+    else:
+        return None
+
+def extract_session_info(event_emitter) -> Optional[SessionInfo]:
+    """The latest innovation in hacky workarounds."""
+    try:
+        info = event_emitter.__closure__[0].cell_contents
+        return SessionInfo(
+            chat_id=info["chat_id"],
+            message_id=info["message_id"],
+            session_id=info["session_id"]
+        )
+    except:
+        return None
 
 class Filter:
     class Valves(BaseModel):
-        scaling_start: int = Field(
-            default=90,
-            description="VRAM usage percent to start scaling back GPU layers",
-        )
+        reduction_start: int = Field(
+            default=20, description="Amount of GPU layers to reduce to immediately on failure"
+        )
         scaling_step: int = Field(
-            default=3, description="Amount of GPU layers to reduce"
+            default=5, description="Amount of GPU layers to reduce by on continued failures"
+        )
+        show_status: bool = Field(
+            default=True, description="Show status message when running downscaled model."
         )
         pass
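extract_session_info above digs the chat, message, and session ids out of the first closure cell of the event emitter that Open WebUI passes to the filter. The snippet below is a toy reproduction of that trick; make_emitter and the ids are invented stand-ins, not Open WebUI code.

def make_emitter(info: dict):
    # The real emitter is built by Open WebUI as a closure over request info;
    # this factory only imitates that shape.
    async def emitter(event):
        print(info["session_id"], event)
    return emitter

emitter = make_emitter({
    "chat_id": "chat-123",
    "message_id": "msg-456",
    "session_id": "sess-789",
})

# Same access pattern as extract_session_info:
info = emitter.__closure__[0].cell_contents
print(info["chat_id"], info["message_id"], info["session_id"])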
@@ -61,27 +116,79 @@ class Filter:
         self.valves = self.Valves()
         pass
 
-    async def message_adjusting(self, done: bool):
+    async def send_message_adjusting(self, done: bool, amount: int=0, steps: int=0):
+        if steps > 0:
+            steps_desc = f"reduced by {steps}"
+        else:
+            steps_desc = "initial reduction"
+        desc = (
+            "Downscaling GPU layers..." if not done
+            else f"GPU layers downscaled to {amount} ({steps_desc}). Please retry.")
         await self.event_emitter(
             {
                 "type": "status",
                 "data": {
-                    "description": "Adjusting GPU layers",
-                    "done": done,
+                    "description": desc,
+                    "done": done
                 },
             }
         )
 
-    async def retry_message(self, body, user):
-        request = GenerateChatCompletionForm(
-            model=body["model"],
-            messages=body["messages"],
-            stream=False,
-            keep_alive="10s",
-            options={"num_gpu": 1},
-        )
-        return await generate_chat_completion(request, user=user)
+    async def send_message_downscaled(self):
+        await self.event_emitter(
+            {
+                "type": "status",
+                "data": {
+                    "description": "Running at reduced GPU capacity. Responses will be slower.",
+                    "done": True
+                },
+            }
+        )
+
+    def get_num_layers_for_model(
+        self,
+        gpu_layer_info: GpuChatState,
+        __model__: dict
+    ) -> Optional[int]:
+        model_id = extract_model_id(__model__)
+        if model_id:
+            return gpu_layer_info.get_gpu_layers_for_model(model_id)
+        else:
+            return None
+
+    async def downscale(self, model):
+        """Update tracked downscaled GPU layers for this chat + model."""
+        # This logic is currently very basic and does not yet take into
+        # account the actual number of layers in a model, but it's better
+        # than nothing. If this is the first failure (no entry in GPU chat
+        # state), set the number of layers to the reduction_start valve.
+        # If this is a subsequent failure (an entry for this chat already
+        # exists), reduce by the scaling_step valve, down to a minimum of
+        # 0 (100% CPU).
+        await self.send_message_adjusting(False)
+        gpu_layer_info = GpuChatState(CHROMA_CLIENT, self.session_info.chat_id)
+        num_layers = self.get_num_layers_for_model(gpu_layer_info, model)
+        print(f"num layers is {num_layers}")
+        downscale_steps = 0
+        if num_layers:
+            print(f"Downscaling layers by {self.valves.scaling_step}")
+            num_layers -= self.valves.scaling_step
+            downscale_steps = self.valves.scaling_step
+            if num_layers < 0:
+                num_layers = 0
+        else:
+            num_layers = self.valves.reduction_start
+        model_id = extract_model_id(model)
+        if model_id:
+            gpu_layer_info.set_gpu_layers(model_id, num_layers)
+            await self.send_message_adjusting(True, amount=num_layers, steps=downscale_steps)
+            print(
+                f"Set GPU layers for chat {self.session_info.chat_id} to {num_layers}"
+            )
 
     async def inlet(
         self,
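With the default valves, the downscale policy above jumps to 20 layers on the first failure and then steps down by 5 on each further failure, clamping at 0. A stand-alone restatement of that arithmetic as a pure function (not part of the filter itself):

from typing import Optional

def next_layer_count(current: Optional[int],
                     reduction_start: int = 20,
                     scaling_step: int = 5) -> int:
    # Mirrors the truthiness check in downscale(): no stored entry (None)
    # and a stored value of 0 both fall back to reduction_start.
    if not current:
        return reduction_start
    return max(current - scaling_step, 0)

print(next_layer_count(None))  # 20  (first failure)
print(next_layer_count(20))    # 15  (second failure)
print(next_layer_count(3))     # 0   (fully on CPU)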
@@ -89,7 +196,21 @@ class Filter:
         __event_emitter__: Callable[[Any], Awaitable[None]],
         __model__: Optional[dict] = None,
     ) -> dict:
+        """Intercept incoming messages and downscale if necessary."""
         self.event_emitter = __event_emitter__
+        self.session_info = extract_session_info(__event_emitter__)
+
+        if self.session_info and __model__:
+            model_id = extract_model_id(__model__)
+            gpu_layer_info = GpuChatState(CHROMA_CLIENT, self.session_info.chat_id)
+            num_layers = self.get_num_layers_for_model(gpu_layer_info, __model__)
+
+            if num_layers and "options" in body:
+                body["options"]["num_gpu"] = num_layers
+                if self.valves.show_status:
+                    await self.send_message_downscaled()
+                print(f"Downscaled GPU layers for incoming request for {model_id} to {num_layers}")
+
         return body
 
     async def outlet(
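The inlet above only touches body["options"]["num_gpu"], which Ollama interprets as the number of layers to offload to the GPU. Roughly what that mutation looks like on a request body; the payload below is illustrative, not a verbatim Open WebUI body.

body = {
    "model": "llama3:8b",  # hypothetical model id
    "messages": [{"role": "user", "content": "hello"}],
    "options": {"temperature": 0.7},
}

num_layers = 15  # value previously stored by downscale() for this chat

if num_layers and "options" in body:
    body["options"]["num_gpu"] = num_layers

print(body["options"])  # {'temperature': 0.7, 'num_gpu': 15}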
@@ -99,23 +220,26 @@ class Filter:
         __event_emitter__: Callable[[Any], Awaitable[None]],
         __model__: Optional[dict] = None,
     ) -> dict:
-        user = convert_user(__user__)
+        """On response failure, downscale the GPU layers for next try."""
         self.event_emitter = __event_emitter__
+        self.session_info = extract_session_info(__event_emitter__)
+
+        if not self.session_info or not __model__:
+            return body
 
         if len(body["messages"]) == 0:
             return body
 
-        message = body["messages"][-1]
-        write_log("got a message")
-        write_log(f"message: {str(message)}")
-        broke = message["content"] == "" and message["info"] == {}
+        last_reply = body["messages"][-1]
+        broke = last_reply["content"] == "" and last_reply["info"] == {}
 
         if broke:
-            # at this point, we COULD set status and attempt to reduce
-            # the GPU layers?
-            await self.message_adjusting(False)
-            del body["messages"][-1]
-            retried = await self.retry_message(body, user)
-            await self.message_adjusting(True)
-            message["content"] = get_last_assistant_message(retried)
+            # While we could actually redo the message itself, it is
+            # useless, because Open WebUI does not currently have a way
+            # to clear the error state when message content is replaced.
+            # So we just lower the GPU layers and tell the user to try
+            # again; the inlet will intercept the incoming request and
+            # lower the GPU layers.
+            await self.downscale(__model__)
 
         return body
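The outlet treats an assistant reply with empty content and an empty info dict as a failed generation and triggers the downscale for the next attempt. A minimal illustration of that check, with invented message dicts:

def is_broken(reply: dict) -> bool:
    # Same condition the outlet applies to the last message in the chat.
    return reply["content"] == "" and reply["info"] == {}

ok_reply = {"role": "assistant", "content": "Hi there!", "info": {"eval_count": 42}}
failed_reply = {"role": "assistant", "content": "", "info": {}}

print(is_broken(ok_reply))      # False
print(is_broken(failed_reply))  # True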