Improve handling of images in request

2026-04-10 00:57:38 +03:00 · 2026-01-22 08:59:30 -07:00 · 2026-01-22 08:59:30 -07:00 · 7e14b8a42e
commit 7e14b8a42e
parent 6569f27c26
3 changed files with 86 additions and 18 deletions
--- a/frigate/api/chat.py
+++ b/frigate/api/chat.py
@ -4,7 +4,7 @@ import base64
 import json
 import logging
 from datetime import datetime, timezone
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 import cv2
 from fastapi import APIRouter, Body, Depends, Request
@ -94,10 +94,10 @@ def get_tool_definitions() -> List[Dict[str, Any]]:
            "function": {
                "name": "get_live_context",
                "description": (
-                    "Get the current live view and detection information for a camera. "
+                    "Get the current detection information for a camera: objects being tracked, "
-                    "Returns the current camera frame as a base64-encoded image along with "
+                    "zones, timestamps. Use this to understand what is visible in the live view. "
-                    "information about objects currently being tracked/detected on the camera. "
+                    "Call this when the user has included a live image (via include_live_image) or "
-                    "Use this to answer questions about what is happening right now on a specific camera."
+                    "when answering questions about what is happening right now on a specific camera."
                ),
                "parameters": {
                    "type": "object",
@ -255,16 +255,6 @@ async def _execute_get_live_context(
                "error": f"Camera '{camera}' state not available",
            }
        frame = frame_processor.get_current_frame(camera, {})
        if frame is None:
            return {
                "error": f"Unable to get current frame for camera '{camera}'",
            }
        _, img_encoded = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
        image_base64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8")
        image_data_url = f"data:image/jpeg;base64,{image_base64}"
        tracked_objects_dict = {}
        with camera_state.current_frame_lock:
            tracked_objects = camera_state.tracked_objects.copy()
@ -283,7 +273,6 @@ async def _execute_get_live_context(
        return {
            "camera": camera,
            "timestamp": frame_time,
            "image": image_data_url,
            "detections": list(tracked_objects_dict.values()),
        }
@ -294,6 +283,46 @@ async def _execute_get_live_context(
        }
 async def _get_live_frame_image_url(
    request: Request,
    camera: str,
    allowed_cameras: List[str],
 ) -> Optional[str]:
    """
    Fetch the current live frame for a camera as a base64 JPEG data URL.

    Used when include_live_image is set, to attach the image to the first
    user message.

    Args:
        request: Incoming request; the app's ``frigate_config`` and
            ``detected_frames_processor`` attributes are read from it.
        camera: Camera ID whose current frame should be fetched.
        allowed_cameras: Camera IDs the caller is permitted to access.

    Returns:
        A ``data:image/jpeg;base64,...`` URL, or None when the camera is not
        allowed, is not configured, has no state, has no current frame, or
        the frame cannot be resized/encoded. Failures are logged at debug
        level and never raised (best-effort attachment).
    """
    if (
        camera not in allowed_cameras
        or camera not in request.app.frigate_config.cameras
    ):
        return None
    try:
        frame_processor = request.app.detected_frames_processor
        if camera not in frame_processor.camera_states:
            return None
        frame = frame_processor.get_current_frame(camera, {})
        if frame is None:
            return None
        # The first two entries of frame.shape are treated as (height, width).
        height, width = frame.shape[:2]
        max_dimension = 1024
        if height > max_dimension or width > max_dimension:
            # Downscale so the longest side equals max_dimension, keeping
            # the aspect ratio.
            scale = max_dimension / max(height, width)
            # Clamp to 1px: int() truncation can produce 0 for extreme
            # aspect ratios, which is not a valid resize target.
            target_size = (
                max(1, int(width * scale)),
                max(1, int(height * scale)),
            )
            frame = cv2.resize(
                frame,
                target_size,
                interpolation=cv2.INTER_AREA,
            )
        encoded_ok, img_encoded = cv2.imencode(
            ".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 85]
        )
        # imencode reports failure through its first return value; without
        # this check a failed encode could yield an empty/invalid data URL.
        if not encoded_ok:
            logger.debug("Failed to encode live frame for %s as JPEG", camera)
            return None
        b64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8")
        return f"data:image/jpeg;base64,{b64}"
    except Exception as e:
        # Deliberate best-effort: the chat request proceeds without the image.
        logger.debug("Failed to get live frame for %s: %s", camera, e)
        return None
 async def _execute_tool_internal(
    tool_name: str,
    arguments: Dict[str, Any],
@ -391,13 +420,21 @@ async def chat_completion(
            + "\n\nWhen users refer to cameras by their friendly name (e.g., 'Back Deck Camera'), use the corresponding camera ID (e.g., 'back_deck_cam') in tool calls."
        )
    live_image_note = ""
    if body.include_live_image:
        live_image_note = (
            f"\n\nThe first user message includes a live image from camera "
            f"'{body.include_live_image}'. Use get_live_context for that camera to get "
            "current detection details (objects, zones) to aid in understanding the image."
        )
    system_prompt = f"""You are a helpful assistant for Frigate, a security camera NVR system. You help users answer questions about their cameras, detected objects, and events.
 Current date and time: {current_date_str} at {current_time_str} (UTC)
 When users ask questions about "today", "yesterday", "this week", etc., use the current date above as reference.
 When searching for objects or events, use ISO 8601 format for dates (e.g., {current_date_str}T00:00:00Z for the start of today).
-Always be accurate with time calculations based on the current date provided.{cameras_section}"""
+Always be accurate with time calculations based on the current date provided.{cameras_section}{live_image_note}"""
    conversation.append(
        {
@ -406,6 +443,7 @@ Always be accurate with time calculations based on the current date provided.{ca
        }
    )
    first_user_message_seen = False
    for msg in body.messages:
        msg_dict = {
            "role": msg.role,
@ -415,6 +453,22 @@ Always be accurate with time calculations based on the current date provided.{ca
            msg_dict["tool_call_id"] = msg.tool_call_id
        if msg.name:
            msg_dict["name"] = msg.name
        if (
            msg.role == "user"
            and not first_user_message_seen
            and body.include_live_image
        ):
            first_user_message_seen = True
            image_url = await _get_live_frame_image_url(
                request, body.include_live_image, allowed_cameras
            )
            if image_url:
                msg_dict["content"] = [
                    {"type": "text", "text": msg.content},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ]
        conversation.append(msg_dict)
    tool_iterations = 0
--- a/frigate/api/defs/request/chat_body.py
+++ b/frigate/api/defs/request/chat_body.py
@ -32,3 +32,10 @@ class ChatCompletionRequest(BaseModel):
        le=10,
        description="Maximum number of tool call iterations (default: 5)",
    )
    include_live_image: Optional[str] = Field(
        default=None,
        description=(
            "If set, the current live frame from this camera is attached to the first "
            "user message as multimodal content. Use with get_live_context for detection info."
        ),
    )
--- a/frigate/genai/llama_cpp.py
+++ b/frigate/genai/llama_cpp.py
@ -216,7 +216,14 @@ class LlamaCppClient(GenAIClient):
                "finish_reason": "error",
            }
        except requests.exceptions.RequestException as e:
-            logger.warning("llama.cpp returned an error: %s", str(e))
+            error_detail = str(e)
            if hasattr(e, "response") and e.response is not None:
                try:
                    error_body = e.response.text
                    error_detail = f"{str(e)} - Response: {error_body[:500]}"
                except Exception:
                    pass
            logger.warning("llama.cpp returned an error: %s", error_detail)
            return {
                "content": None,
                "tool_calls": None,