Fix Ollama tool-calling failure when the conversation contains multimodal content from live-frame tool results

This commit is contained in:
Josh Hawkins 2026-05-06 09:16:23 -05:00
parent 1972ba41fa
commit d5effc52c0

View File

@ -1,5 +1,6 @@
"""Ollama Provider for Frigate AI."""
import base64
import json
import logging
from typing import Any, AsyncGenerator, Optional
@ -16,6 +17,41 @@ from frigate.genai.utils import parse_tool_calls_from_message
logger = logging.getLogger(__name__)
def _normalize_multimodal_content(
content: Any,
) -> tuple[Optional[str], Optional[list[bytes]]]:
"""Convert OpenAI-style multimodal content to Ollama's (text, images) shape.
The chat API constructs user messages with content as a list of
``{"type": "text"}`` and ``{"type": "image_url"}`` parts when a tool
returns a live frame. Ollama's SDK requires content to be a string and
images to be passed in a separate field, so we extract each.
"""
if not isinstance(content, list):
return content, None
text_parts: list[str] = []
images: list[bytes] = []
for part in content:
if not isinstance(part, dict):
continue
part_type = part.get("type")
if part_type == "text":
text = part.get("text")
if text:
text_parts.append(str(text))
elif part_type == "image_url":
url = (part.get("image_url") or {}).get("url", "")
if isinstance(url, str) and url.startswith("data:"):
try:
encoded = url.split(",", 1)[1]
images.append(base64.b64decode(encoded, validate=True))
except (ValueError, IndexError, base64.binascii.Error) as e:
logger.debug("Failed to decode multimodal image url: %s", e)
return ("\n".join(text_parts) if text_parts else None), (images or None)
@register_genai_provider(GenAIProviderEnum.ollama)
class OllamaClient(GenAIClient):
"""Generative AI client for Frigate using Ollama."""
@ -207,10 +243,13 @@ class OllamaClient(GenAIClient):
"""Build request_messages and params for chat (sync or stream)."""
request_messages = []
for msg in messages:
msg_dict = {
content, images = _normalize_multimodal_content(msg.get("content", ""))
msg_dict: dict[str, Any] = {
"role": msg.get("role"),
"content": msg.get("content", ""),
"content": content if content is not None else "",
}
if images:
msg_dict["images"] = images
if msg.get("tool_call_id"):
msg_dict["tool_call_id"] = msg["tool_call_id"]
if msg.get("name"):