Auto llama.cpp context (#22737)
Some checks are pending
CI / AMD64 Build (push) Waiting to run
CI / ARM Build (push) Waiting to run
CI / Jetson Jetpack 6 (push) Waiting to run
CI / AMD64 Extra Build (push) Blocked by required conditions
CI / ARM Extra Build (push) Blocked by required conditions
CI / Synaptics Build (push) Blocked by required conditions
CI / Assemble and push default build (push) Blocked by required conditions

* Add model probing

* Include aliases

* Pull correctly

* Correctly query specific model props

* Debug log

* Update model list
This commit is contained in:
Nicolas Mowen 2026-04-02 19:13:34 -06:00 committed by GitHub
parent 520d9eeb7f
commit 68dfb157ea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 124 additions and 6 deletions

View File

@ -29,11 +29,11 @@ You must use a vision-capable model with Frigate. The following models are recom
| Model | Notes | | Model | Notes |
| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `qwen3-vl` | Strong visual and situational understanding, strong ability to identify smaller objects and interactions with objects. | | `qwen3-vl` | Strong visual and situational understanding, enhanced ability to identify smaller objects and interactions with objects. |
| `qwen3.5` | Strong situational understanding, but missing DeepStack from qwen3-vl leading to worse performance for identifying objects in people's hands and other small details. | | `qwen3.5` | Strong situational understanding, but missing DeepStack from qwen3-vl leading to worse performance for identifying objects in people's hands and other small details. |
| `gemma4` | Strong situational understanding, sometimes resorts to more vague terms like 'interacts' instead of assigning a specific action. |
| `Intern3.5VL` | Relatively fast with good vision comprehension | | `Intern3.5VL` | Relatively fast with good vision comprehension |
| `gemma3` | Slower model with good vision and temporal understanding | | `gemma3` | Slower model with good vision and temporal understanding |
| `qwen2.5-vl` | Fast but capable model with good vision comprehension |
:::info :::info

View File

@ -38,18 +38,111 @@ class LlamaCppClient(GenAIClient):
provider: str | None # base_url provider: str | None # base_url
provider_options: dict[str, Any] provider_options: dict[str, Any]
_context_size: int | None
_supports_vision: bool
_supports_audio: bool
_supports_tools: bool
def _init_provider(self) -> str | None:
    """Initialize the client and query model metadata from the server.

    Probes the llama.cpp server twice: once against /v1/models to verify
    the configured model (or one of its aliases) is actually loaded, and
    once against /props to discover the context size and the model's
    vision/audio/tool capabilities. Both probes are best-effort — a
    network or parse failure is logged and defaults are kept.

    Returns the normalized base URL on success, or None when no base URL
    is configured or the configured model is not present on the server.
    """
    self.provider_options = {**self.genai_config.provider_options}

    # Conservative defaults until the server tells us otherwise.
    self._context_size = None
    self._supports_vision = False
    self._supports_audio = False
    self._supports_tools = False

    base_url = (
        self.genai_config.base_url.rstrip("/")
        if self.genai_config.base_url
        else None
    )
    if base_url is None:
        return None

    configured_model = self.genai_config.model

    # Query /v1/models to validate the configured model exists
    try:
        resp = requests.get(f"{base_url}/v1/models", timeout=10)
        resp.raise_for_status()
        catalog = resp.json()
        entries = catalog.get("data", [])

        # A model matches on its canonical id or any advertised alias.
        found = any(
            configured_model in {entry.get("id"), *entry.get("aliases", [])}
            for entry in entries
        )

        if not found:
            known: list[str] = []
            for entry in entries:
                known.append(entry.get("id", "unknown"))
                known.extend(entry.get("aliases", []))
            logger.error(
                "Model '%s' not found on llama.cpp server. Available models: %s",
                configured_model,
                known,
            )
            return None
    except Exception as e:
        # Best-effort validation only: an unreachable or older server
        # without this endpoint should not block initialization.
        logger.warning(
            "Failed to query llama.cpp /v1/models endpoint: %s. "
            "Model validation skipped.",
            e,
        )

    # Query /props for context size, modalities, and tool support
    try:
        resp = requests.get(
            f"{base_url}/props",
            params={"model": configured_model},
            timeout=10,
        )
        resp.raise_for_status()
        props = resp.json()

        # Context size comes from the server's runtime generation settings.
        n_ctx = props.get("default_generation_settings", {}).get("n_ctx")
        if n_ctx:
            self._context_size = int(n_ctx)

        # Input modalities (vision/audio) as reported by the server.
        modalities = props.get("modalities", {})
        self._supports_vision = modalities.get("vision", False)
        self._supports_audio = modalities.get("audio", False)

        # Tool/function-calling support advertised by the chat template.
        self._supports_tools = props.get("chat_template_caps", {}).get(
            "supports_tools", False
        )

        logger.debug(
            "llama.cpp model '%s' initialized — context: %s, vision: %s, audio: %s, tools: %s",
            configured_model,
            self._context_size or "unknown",
            self._supports_vision,
            self._supports_audio,
            self._supports_tools,
        )
    except Exception as e:
        logger.warning(
            "Failed to query llama.cpp /props endpoint: %s. "
            "Using defaults for context size and capabilities.",
            e,
        )

    return base_url
def _send( def _send(
self, self,
prompt: str, prompt: str,
@ -117,9 +210,34 @@ class LlamaCppClient(GenAIClient):
logger.warning("llama.cpp returned an error: %s", str(e)) logger.warning("llama.cpp returned an error: %s", str(e))
return None return None
@property
def supports_vision(self) -> bool:
    """Whether the loaded model supports vision/image input.

    Reflects the `modalities.vision` flag queried from the llama.cpp
    server's /props endpoint at init; False until/unless probed.
    """
    return self._supports_vision
@property
def supports_audio(self) -> bool:
    """Whether the loaded model supports audio input.

    Reflects the `modalities.audio` flag queried from the llama.cpp
    server's /props endpoint at init; False until/unless probed.
    """
    return self._supports_audio
@property
def supports_tools(self) -> bool:
    """Whether the loaded model supports tool/function calling.

    Reflects `chat_template_caps.supports_tools` queried from the
    llama.cpp server's /props endpoint at init; False until/unless probed.
    """
    return self._supports_tools
def get_context_size(self) -> int:
    """Get the context window size for llama.cpp.

    Resolution order:
    1. provider_options["context_size"] (user override)
    2. Value queried from llama.cpp server at init
    3. Default fallback of 4096
    """
    # User override wins when present (EAFP: a single dict lookup).
    try:
        return int(self.provider_options["context_size"])
    except KeyError:
        pass
    # Next, the value probed from the server during init; else the default.
    probed = self._context_size
    return probed if probed is not None else 4096
def _build_payload( def _build_payload(
self, self,