fix ollama tool calling failure when conversation contains multimodal content from live frame tool results

2026-05-06 21:45:32 +03:00 · 2026-05-06 09:16:23 -05:00 · 2026-05-06 09:16:23 -05:00 · d5effc52c0
commit d5effc52c0
parent 1972ba41fa
1 changed files with 41 additions and 2 deletions
--- a/frigate/genai/ollama.py
+++ b/frigate/genai/ollama.py
@@ -1,5 +1,6 @@
 """Ollama Provider for Frigate AI."""

+import base64
 import json
 import logging
 from typing import Any, AsyncGenerator, Optional
@@ -16,6 +17,41 @@ from frigate.genai.utils import parse_tool_calls_from_message
 logger = logging.getLogger(__name__)


+def _normalize_multimodal_content(
+    content: Any,
+) -> tuple[Optional[str], Optional[list[bytes]]]:
+    """Convert OpenAI-style multimodal content to Ollama's (text, images) shape.
+
+    The chat API sends content as a list of ``{"type": "text"}`` and
+    ``{"type": "image_url"}`` parts when a tool returns a live frame, but
+    Ollama's SDK needs a string plus separate image bytes. Non-list content
+    passes through as ``(content, None)``; empty results become ``None``.
+    """
+    if not isinstance(content, list):
+        return content, None
+
+    text_parts: list[str] = []
+    images: list[bytes] = []
+    for part in content:
+        if not isinstance(part, dict):
+            continue
+        part_type = part.get("type")
+        if part_type == "text":
+            if text := part.get("text"):
+                text_parts.append(str(text))
+        elif part_type == "image_url":
+            image_url = part.get("image_url")  # may be a dict or a bare URL string
+            url = image_url.get("url") if isinstance(image_url, dict) else image_url
+            if isinstance(url, str) and url.startswith("data:"):
+                try:
+                    encoded = url.split(",", 1)[1]
+                    images.append(base64.b64decode(encoded, validate=True))
+                except (ValueError, IndexError) as e:  # binascii.Error is a ValueError
+                    logger.debug("Failed to decode multimodal image url: %s", e)
+
+    return ("\n".join(text_parts) if text_parts else None), (images or None)
+
+
@register_genai_provider(GenAIProviderEnum.ollama)
 class OllamaClient(GenAIClient):
    """Generative AI client for Frigate using Ollama."""
@@ -207,10 +243,13 @@ class OllamaClient(GenAIClient):
        """Build request_messages and params for chat (sync or stream)."""
        request_messages = []
        for msg in messages:
-            msg_dict = {
+            content, images = _normalize_multimodal_content(msg.get("content", ""))
+            msg_dict: dict[str, Any] = {
                "role": msg.get("role"),
-                "content": msg.get("content", ""),
+                "content": content if content is not None else "",
            }
+            if images:
+                msg_dict["images"] = images
            if msg.get("tool_call_id"):
                msg_dict["tool_call_id"] = msg["tool_call_id"]
            if msg.get("name"):