More robust GPU scaling - properly extract models, don't operate on OpenAI models.
parent 1735f23173
commit 2eba340523
@@ -2,7 +2,7 @@
 title: GPU Scaling Filter
 author: projectmoon
 author_url: https://git.agnos.is/projectmoon/open-webui-filters
-version: 0.1.0
+version: 0.2.0
 license: AGPL-3.0+
 required_open_webui_version: 0.3.9
 """
@@ -85,11 +85,19 @@ def dict_to_attributes(input_dict):
     return AttrDict(input_dict)
 
 def extract_model_id(model: dict) -> Optional[str]:
+    model_id = None
+
     if "info" in model:
-        model_info = model["info"]
-        return model_info["base_model_id"] if "base_model_id" in model_info else model["id"]
+        if "base_model_id" in model["info"]:
+            model_id = model["info"]["base_model_id"]
     else:
-        return None
+        if "ollama" in model and "id" in model["ollama"]:
+            model_id = model["ollama"]["id"]
+
+    if not model_id:
+        model_id = model["id"]
+
+    return model_id
 
 def extract_session_info(event_emitter) -> Optional[SessionInfo]:
    """The latest innovation in hacky workarounds."""
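Note on the rewritten extract_model_id: it now resolves workspace models through base_model_id, plain Ollama models through the nested ollama dict, and everything else through the top-level id (the final fallback assumes model["id"] is always present). A quick sketch of the three paths, using hypothetical model dicts rather than real Open WebUI payloads:

    workspace_model = {"id": "my-tune", "info": {"base_model_id": "llama3:8b"}}
    plain_ollama_model = {"id": "llama3:8b", "ollama": {"id": "llama3:8b"}}
    bare_model = {"id": "some-model"}

    assert extract_model_id(workspace_model) == "llama3:8b"     # follows base_model_id
    assert extract_model_id(plain_ollama_model) == "llama3:8b"  # nested ollama id
    assert extract_model_id(bare_model) == "some-model"         # top-level fallback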
|
@@ -171,10 +179,15 @@ class Filter:
         # parameter. if this is a subsequent failure (we have entry
         # for this chat already), reduce by the step valve parameter,
         # to a minimum of CPU (100% cpu).
+        model_id = extract_model_id(model)
+
+        if not model_id:
+            print("Could not extract model ID for GPU downscaling!")
+            return
+
         await self.send_message_adjusting(False)
         gpu_layer_info = GpuChatState(CHROMA_CLIENT, self.session_info.chat_id)
         num_layers = self.get_num_layers_for_model(gpu_layer_info, model)
-        print(f"num layers is {num_layers}")
         downscale_steps = 0
 
         if num_layers:
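For orientation: per-chat layer counts live in GpuChatState, keyed by model ID. Only its constructor and set_gpu_layers appear in this diff, so the following is an assumed in-memory sketch of the interface these hunks rely on; get_gpu_layers is a guessed counterpart, and the real class presumably persists through CHROMA_CLIENT rather than a plain dict:

    from typing import Dict, Optional

    class GpuChatState:
        # Assumed sketch: tracks GPU layer counts per model for one chat.
        def __init__(self, chroma_client, chat_id: str):
            self.client = chroma_client   # real class likely persists here
            self.chat_id = chat_id
            self._layers: Dict[str, int] = {}

        def get_gpu_layers(self, model_id: str) -> Optional[int]:
            return self._layers.get(model_id)

        def set_gpu_layers(self, model_id: str, amount: int) -> None:
            self._layers[model_id] = amount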
|
@@ -186,13 +199,11 @@ class Filter:
         else:
             num_layers = self.valves.reduction_start
 
-        model_id = extract_model_id(model)
-        if model_id:
-            gpu_layer_info.set_gpu_layers(model_id, num_layers)
-            await self.send_message_adjusting(True, amount=num_layers, steps=downscale_steps)
-            print(
-                f"Set GPU layers for chat {self.session_info.chat_id} to {num_layers}"
-            )
+        gpu_layer_info.set_gpu_layers(model_id, num_layers)
+        await self.send_message_adjusting(True, amount=num_layers, steps=downscale_steps)
+        print(
+            f"Set GPU layers for chat {self.session_info.chat_id} to {num_layers}"
+        )
 
     async def inlet(
         self,
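The comments above spell out the downscale policy: on the first failure for a chat, start from the reduction_start valve; on later failures, shed layers step by step down to pure CPU. A compact sketch of that arithmetic (reduction_start comes from the diff; the step valve's exact name and a floor of 0 layers are assumptions):

    from typing import Optional

    def next_gpu_layers(current: Optional[int], start: int, step: int) -> int:
        if current is None:            # first failure for this chat
            return start
        return max(current - step, 0)  # later failures: never below 0 (CPU only)

    assert next_gpu_layers(None, start=20, step=5) == 20
    assert next_gpu_layers(20, start=20, step=5) == 15
    assert next_gpu_layers(3, start=20, step=5) == 0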
|
@@ -201,19 +212,25 @@ class Filter:
         __model__: Optional[dict] = None,
     ) -> dict:
         """Intercept incoming messages and downscale if necessary."""
+        if not __model__ or __model__["owned_by"] != "ollama":
+            return body
+
         self.event_emitter = __event_emitter__
         self.session_info = extract_session_info(__event_emitter__)
 
-        if self.session_info and __model__:
-            model_id = extract_model_id(__model__)
+        if self.session_info:
             gpu_layer_info = GpuChatState(CHROMA_CLIENT, self.session_info.chat_id)
             num_layers = self.get_num_layers_for_model(gpu_layer_info, __model__)
 
             if num_layers and "options" in body:
+                model_id = extract_model_id(__model__)
                 body["options"]["num_gpu"] = num_layers
                 if self.valves.show_status:
                     await self.send_message_downscaled()
-                print(f"Downscaled GPU layers for incoming request for {model_id} to {num_layers}")
+                print((
+                    f"Downscaled GPU layers for incoming request for {model_id} "
+                    f"to {num_layers}"
+                ))
 
         return body
 
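This hunk carries the commit's headline change: inlet now returns the body untouched for anything not owned by Ollama, so OpenAI models are never scaled, and the stored layer count only reaches Ollama through its num_gpu option. A small sketch of the guard's effect (the "ollama" value is from the diff; "openai" as the other owned_by value is an inference, not a confirmed Open WebUI constant):

    from typing import Optional

    def touches_model(model: Optional[dict]) -> bool:
        # Mirrors the new early return: only Ollama-owned models are scaled.
        return bool(model) and model.get("owned_by") == "ollama"

    assert touches_model({"id": "llama3:8b", "owned_by": "ollama"})
    assert not touches_model({"id": "gpt-4o", "owned_by": "openai"})
    assert not touches_model(None)

The same guard is added to outlet in the next hunk.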
|
@@ -225,6 +242,9 @@ class Filter:
         __model__: Optional[dict] = None,
     ) -> dict:
         """On response failure, downscale the GPU layers for next try."""
+        if not __model__ or __model__["owned_by"] != "ollama":
+            return body
+
         self.event_emitter = __event_emitter__
         self.session_info = extract_session_info(__event_emitter__)
 
|
|