Fix llama.cpp media marker

Nicolas Mowen 2026-04-29 08:43:32 -06:00
parent 3201985359
commit 11bb9fed4c


@@ -44,6 +44,7 @@ class LlamaCppClient(GenAIClient):
     _supports_tools: bool
     _image_token_cache: dict[tuple[int, int], int]
     _text_baseline_tokens: int | None
+    _media_marker: str
 
     def _init_provider(self) -> str | None:
         """Initialize the client and query model metadata from the server."""
@@ -56,6 +57,7 @@ class LlamaCppClient(GenAIClient):
         self._supports_tools = False
         self._image_token_cache = {}
         self._text_baseline_tokens = None
+        self._media_marker = "<__media__>"
 
         base_url = (
             self.genai_config.base_url.rstrip("/")
@@ -141,6 +143,13 @@ class LlamaCppClient(GenAIClient):
         chat_caps = props.get("chat_template_caps", {})
         self._supports_tools = chat_caps.get("supports_tools", False)
 
+        # Media marker for multimodal embeddings; the server randomizes this
+        # per startup unless LLAMA_MEDIA_MARKER is set, so we must read it
+        # from /props rather than hardcoding "<__media__>".
+        media_marker = props.get("media_marker")
+        if isinstance(media_marker, str) and media_marker:
+            self._media_marker = media_marker
+
         logger.info(
             "llama.cpp model '%s' initialized — context: %s, vision: %s, audio: %s, tools: %s",
             configured_model,
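As context for the hunk above, here is a minimal standalone sketch of reading the marker from the server's /props endpoint. The endpoint and the media_marker field come from the diff; the fetch_media_marker helper, the requests-based call, and the timeout are illustrative assumptions rather than code from this commit.

```python
import requests


def fetch_media_marker(base_url: str, fallback: str = "<__media__>") -> str:
    """Read the multimodal media marker from a llama.cpp server's /props.

    The server randomizes the marker on each startup unless
    LLAMA_MEDIA_MARKER is set, so hardcoding "<__media__>" only works
    when that environment variable is present.
    """
    try:
        # Hypothetical direct call; the real client reuses its own session.
        props = requests.get(f"{base_url}/props", timeout=5).json()
    except requests.RequestException:
        return fallback
    marker = props.get("media_marker")
    return marker if isinstance(marker, str) and marker else fallback
```

Falling back to "<__media__>" mirrors the default the client initializes before /props is queried.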
@@ -465,10 +474,11 @@ class LlamaCppClient(GenAIClient):
             jpeg_bytes = _to_jpeg(img)
             to_encode = jpeg_bytes if jpeg_bytes is not None else img
             encoded = base64.b64encode(to_encode).decode("utf-8")
-            # prompt_string must contain <__media__> placeholder for image tokenization
+            # prompt_string must contain the server's media marker placeholder.
+            # The marker is randomized per server startup (read from /props).
             content.append(
                 {
-                    "prompt_string": "<__media__>\n",
+                    "prompt_string": f"{self._media_marker}\n",
                     "multimodal_data": [encoded],  # type: ignore[dict-item]
                 }
             )
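To illustrate what the final hunk produces, this is a hedged sketch of building the per-image entries with the discovered marker. The prompt_string and multimodal_data field names are taken from the diff; the build_image_entries helper and its signature are hypothetical.

```python
import base64


def build_image_entries(images: list[bytes], media_marker: str) -> list[dict]:
    """Build one multimodal entry per JPEG image, using the server's marker.

    Each prompt_string must contain the marker exactly where the image
    tokens should be injected; multimodal_data carries the base64 payload.
    """
    entries = []
    for img in images:
        encoded = base64.b64encode(img).decode("utf-8")
        entries.append(
            {
                # Marker placeholder followed by a newline, one image per entry,
                # mirroring the shape shown in the diff above.
                "prompt_string": f"{media_marker}\n",
                "multimodal_data": [encoded],
            }
        )
    return entries
```

Because the marker is no longer hardcoded, the same payload-building code works whether the server was started with LLAMA_MEDIA_MARKER set or left to randomize the marker.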