mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-05-07 05:55:27 +03:00
Fix Ollama tool-calling failure when the conversation contains multimodal content from live-frame tool results
This commit is contained in:
parent
1972ba41fa
commit
d5effc52c0
@ -1,5 +1,6 @@
|
|||||||
"""Ollama Provider for Frigate AI."""
|
"""Ollama Provider for Frigate AI."""
|
||||||
|
|
||||||
|
import base64
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from typing import Any, AsyncGenerator, Optional
|
from typing import Any, AsyncGenerator, Optional
|
||||||
@ -16,6 +17,41 @@ from frigate.genai.utils import parse_tool_calls_from_message
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_multimodal_content(
|
||||||
|
content: Any,
|
||||||
|
) -> tuple[Optional[str], Optional[list[bytes]]]:
|
||||||
|
"""Convert OpenAI-style multimodal content to Ollama's (text, images) shape.
|
||||||
|
|
||||||
|
The chat API constructs user messages with content as a list of
|
||||||
|
``{"type": "text"}`` and ``{"type": "image_url"}`` parts when a tool
|
||||||
|
returns a live frame. Ollama's SDK requires content to be a string and
|
||||||
|
images to be passed in a separate field, so we extract each.
|
||||||
|
"""
|
||||||
|
if not isinstance(content, list):
|
||||||
|
return content, None
|
||||||
|
|
||||||
|
text_parts: list[str] = []
|
||||||
|
images: list[bytes] = []
|
||||||
|
for part in content:
|
||||||
|
if not isinstance(part, dict):
|
||||||
|
continue
|
||||||
|
part_type = part.get("type")
|
||||||
|
if part_type == "text":
|
||||||
|
text = part.get("text")
|
||||||
|
if text:
|
||||||
|
text_parts.append(str(text))
|
||||||
|
elif part_type == "image_url":
|
||||||
|
url = (part.get("image_url") or {}).get("url", "")
|
||||||
|
if isinstance(url, str) and url.startswith("data:"):
|
||||||
|
try:
|
||||||
|
encoded = url.split(",", 1)[1]
|
||||||
|
images.append(base64.b64decode(encoded, validate=True))
|
||||||
|
except (ValueError, IndexError, base64.binascii.Error) as e:
|
||||||
|
logger.debug("Failed to decode multimodal image url: %s", e)
|
||||||
|
|
||||||
|
return ("\n".join(text_parts) if text_parts else None), (images or None)
|
||||||
|
|
||||||
|
|
||||||
@register_genai_provider(GenAIProviderEnum.ollama)
|
@register_genai_provider(GenAIProviderEnum.ollama)
|
||||||
class OllamaClient(GenAIClient):
|
class OllamaClient(GenAIClient):
|
||||||
"""Generative AI client for Frigate using Ollama."""
|
"""Generative AI client for Frigate using Ollama."""
|
||||||
@ -207,10 +243,13 @@ class OllamaClient(GenAIClient):
|
|||||||
"""Build request_messages and params for chat (sync or stream)."""
|
"""Build request_messages and params for chat (sync or stream)."""
|
||||||
request_messages = []
|
request_messages = []
|
||||||
for msg in messages:
|
for msg in messages:
|
||||||
msg_dict = {
|
content, images = _normalize_multimodal_content(msg.get("content", ""))
|
||||||
|
msg_dict: dict[str, Any] = {
|
||||||
"role": msg.get("role"),
|
"role": msg.get("role"),
|
||||||
"content": msg.get("content", ""),
|
"content": content if content is not None else "",
|
||||||
}
|
}
|
||||||
|
if images:
|
||||||
|
msg_dict["images"] = images
|
||||||
if msg.get("tool_call_id"):
|
if msg.get("tool_call_id"):
|
||||||
msg_dict["tool_call_id"] = msg["tool_call_id"]
|
msg_dict["tool_call_id"] = msg["tool_call_id"]
|
||||||
if msg.get("name"):
|
if msg.get("name"):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user