More robust GPU scaling - properly extract models, don't operate on OpenAI models.

projectmoon 2024-07-22 20:49:35 +02:00
parent 1735f23173
commit 2eba340523
1 changed file with 35 additions and 15 deletions


@@ -2,7 +2,7 @@
 title: GPU Scaling Filter
 author: projectmoon
 author_url: https://git.agnos.is/projectmoon/open-webui-filters
-version: 0.1.0
+version: 0.2.0
 license: AGPL-3.0+
 required_open_webui_version: 0.3.9
 """
@@ -85,11 +85,19 @@ def dict_to_attributes(input_dict):
     return AttrDict(input_dict)

 def extract_model_id(model: dict) -> Optional[str]:
+    model_id = None
+
     if "info" in model:
-        model_info = model["info"]
-        return model_info["base_model_id"] if "base_model_id" in model_info else model["id"]
+        if "base_model_id" in model["info"]:
+            model_id = model["info"]["base_model_id"]
     else:
-        return None
+        if "ollama" in model and "id" in model["ollama"]:
+            model_id = model["ollama"]["id"]
+
+    if not model_id:
+        model_id = model["id"]
+
+    return model_id

 def extract_session_info(event_emitter) -> Optional[SessionInfo]:
     """The latest innovation in hacky workarounds."""
@ -171,10 +179,15 @@ class Filter:
# parameter. if this is a subsequent failure (we have entry # parameter. if this is a subsequent failure (we have entry
# for this chat already), reduce by the step valve parameter, # for this chat already), reduce by the step valve parameter,
# to a minimum of CPU (100% cpu). # to a minimum of CPU (100% cpu).
model_id = extract_model_id(model)
if not model_id:
print("Could not extract model ID for GPU downscaling!")
return
await self.send_message_adjusting(False) await self.send_message_adjusting(False)
gpu_layer_info = GpuChatState(CHROMA_CLIENT, self.session_info.chat_id) gpu_layer_info = GpuChatState(CHROMA_CLIENT, self.session_info.chat_id)
num_layers = self.get_num_layers_for_model(gpu_layer_info, model) num_layers = self.get_num_layers_for_model(gpu_layer_info, model)
print(f"num layers is {num_layers}")
downscale_steps = 0 downscale_steps = 0
if num_layers: if num_layers:
@@ -186,13 +199,11 @@ class Filter:
         else:
             num_layers = self.valves.reduction_start

-        model_id = extract_model_id(model)
-        if model_id:
-            gpu_layer_info.set_gpu_layers(model_id, num_layers)
-            await self.send_message_adjusting(True, amount=num_layers, steps=downscale_steps)
-            print(
-                f"Set GPU layers for chat {self.session_info.chat_id} to {num_layers}"
-            )
+        gpu_layer_info.set_gpu_layers(model_id, num_layers)
+        await self.send_message_adjusting(True, amount=num_layers, steps=downscale_steps)
+        print(
+            f"Set GPU layers for chat {self.session_info.chat_id} to {num_layers}"
+        )

     async def inlet(
         self,
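
The comment above describes the downscaling policy: start at the reduction_start valve on the first failure for a chat, then step down on each subsequent failure until reaching 0 layers (pure CPU). A sketch of that policy as a pure function, where reduction_step is a hypothetical valve name (only reduction_start appears in this diff):

from typing import Optional

# Sketch of the downscaling policy described in the comments above.
def next_gpu_layers(previous: Optional[int],
                    reduction_start: int,
                    reduction_step: int) -> int:
    if previous is None:
        # First failure for this chat: start at the valve value.
        return reduction_start
    # Subsequent failures: step down, bottoming out at 0 (100% CPU).
    return max(previous - reduction_step, 0)

assert next_gpu_layers(None, reduction_start=20, reduction_step=5) == 20
assert next_gpu_layers(20, reduction_start=20, reduction_step=5) == 15
assert next_gpu_layers(3, reduction_start=20, reduction_step=5) == 0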
@@ -201,19 +212,25 @@ class Filter:
         __model__: Optional[dict] = None,
     ) -> dict:
         """Intercept incoming messages and downscale if necessary."""
+        if not __model__ or __model__["owned_by"] != "ollama":
+            return body
+
         self.event_emitter = __event_emitter__
         self.session_info = extract_session_info(__event_emitter__)

-        if self.session_info and __model__:
-            model_id = extract_model_id(__model__)
+        if self.session_info:
             gpu_layer_info = GpuChatState(CHROMA_CLIENT, self.session_info.chat_id)
             num_layers = self.get_num_layers_for_model(gpu_layer_info, __model__)

             if num_layers and "options" in body:
+                model_id = extract_model_id(__model__)
                 body["options"]["num_gpu"] = num_layers
                 if self.valves.show_status:
                     await self.send_message_downscaled()
-                print(f"Downscaled GPU layers for incoming request for {model_id} to {num_layers}")
+                print((
+                    f"Downscaled GPU layers for incoming request for {model_id} "
+                    f"to {num_layers}"
+                ))

         return body
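
A small sketch of what the new guard means in practice: models not owned by Ollama (e.g. OpenAI-backed ones) pass through untouched, while Ollama models get the stored layer count injected as body["options"]["num_gpu"]. The helper name should_scale is hypothetical:

from typing import Optional

# Mirrors the guard added to inlet and outlet above (hypothetical helper).
# Note the real guard indexes __model__["owned_by"] directly; .get() is
# used here only to keep the sketch total.
def should_scale(model: Optional[dict]) -> bool:
    return bool(model) and model.get("owned_by") == "ollama"

assert not should_scale({"id": "gpt-4o", "owned_by": "openai"})  # passed through
assert should_scale({"id": "llama3:8b", "owned_by": "ollama"})   # downscaled
assert not should_scale(None)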
@@ -225,6 +242,9 @@ class Filter:
         __model__: Optional[dict] = None,
     ) -> dict:
         """On response failure, downscale the GPU layers for next try."""
+        if not __model__ or __model__["owned_by"] != "ollama":
+            return body
+
         self.event_emitter = __event_emitter__
         self.session_info = extract_session_info(__event_emitter__)