From d5effc52c0fc18b03801866486b984befdccd1f2 Mon Sep 17 00:00:00 2001
From: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>
Date: Wed, 6 May 2026 09:16:23 -0500
Subject: [PATCH] fix ollama tool calling failure when conversation contains
 multimodal content from live frame tool results

---
 frigate/genai/ollama.py | 43 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/frigate/genai/ollama.py b/frigate/genai/ollama.py
index 6ba803bcc..6d4931466 100644
--- a/frigate/genai/ollama.py
+++ b/frigate/genai/ollama.py
@@ -1,5 +1,6 @@
 """Ollama Provider for Frigate AI."""
 
+import base64
 import json
 import logging
 from typing import Any, AsyncGenerator, Optional
@@ -16,6 +17,41 @@ from frigate.genai.utils import parse_tool_calls_from_message
 
 logger = logging.getLogger(__name__)
 
+def _normalize_multimodal_content(
+    content: Any,
+) -> tuple[Optional[str], Optional[list[bytes]]]:
+    """Convert OpenAI-style multimodal content to Ollama's (text, images) shape.
+
+    The chat API constructs user messages with content as a list of
+    ``{"type": "text"}`` and ``{"type": "image_url"}`` parts when a tool
+    returns a live frame. Ollama's SDK requires content to be a string and
+    images to be passed in a separate field, so we extract each.
+    """
+    if not isinstance(content, list):
+        return content, None
+
+    text_parts: list[str] = []
+    images: list[bytes] = []
+    for part in content:
+        if not isinstance(part, dict):
+            continue
+        part_type = part.get("type")
+        if part_type == "text":
+            text = part.get("text")
+            if text:
+                text_parts.append(str(text))
+        elif part_type == "image_url":
+            url = (part.get("image_url") or {}).get("url", "")
+            if isinstance(url, str) and url.startswith("data:"):
+                try:
+                    encoded = url.split(",", 1)[1]
+                    images.append(base64.b64decode(encoded, validate=True))
+                except (ValueError, IndexError, base64.binascii.Error) as e:
+                    logger.debug("Failed to decode multimodal image url: %s", e)
+
+    return ("\n".join(text_parts) if text_parts else None), (images or None)
+
+
 @register_genai_provider(GenAIProviderEnum.ollama)
 class OllamaClient(GenAIClient):
     """Generative AI client for Frigate using Ollama."""
@@ -207,10 +243,13 @@ class OllamaClient(GenAIClient):
         """Build request_messages and params for chat (sync or stream)."""
         request_messages = []
         for msg in messages:
-            msg_dict = {
+            content, images = _normalize_multimodal_content(msg.get("content", ""))
+            msg_dict: dict[str, Any] = {
                 "role": msg.get("role"),
-                "content": msg.get("content", ""),
+                "content": content if content is not None else "",
             }
+            if images:
+                msg_dict["images"] = images
             if msg.get("tool_call_id"):
                 msg_dict["tool_call_id"] = msg["tool_call_id"]
             if msg.get("name"):