From 7e14b8a42e4b1f24e0cb279bef9ff3ca9eddc47b Mon Sep 17 00:00:00 2001 From: Nicolas Mowen Date: Thu, 22 Jan 2026 08:59:30 -0700 Subject: [PATCH] Improve handling of images in request --- frigate/api/chat.py | 88 +++++++++++++++++++++------ frigate/api/defs/request/chat_body.py | 7 +++ frigate/genai/llama_cpp.py | 9 ++- 3 files changed, 86 insertions(+), 18 deletions(-) diff --git a/frigate/api/chat.py b/frigate/api/chat.py index ada7c91c3..444650e13 100644 --- a/frigate/api/chat.py +++ b/frigate/api/chat.py @@ -4,7 +4,7 @@ import base64 import json import logging from datetime import datetime, timezone -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import cv2 from fastapi import APIRouter, Body, Depends, Request @@ -94,10 +94,10 @@ def get_tool_definitions() -> List[Dict[str, Any]]: "function": { "name": "get_live_context", "description": ( - "Get the current live view and detection information for a camera. " - "Returns the current camera frame as a base64-encoded image along with " - "information about objects currently being tracked/detected on the camera. " - "Use this to answer questions about what is happening right now on a specific camera." + "Get the current detection information for a camera: objects being tracked, " + "zones, timestamps. Use this to understand what is visible in the live view. " + "Call this when the user has included a live image (via include_live_image) or " + "when answering questions about what is happening right now on a specific camera." ), "parameters": { "type": "object", @@ -255,16 +255,6 @@ async def _execute_get_live_context( "error": f"Camera '{camera}' state not available", } - frame = frame_processor.get_current_frame(camera, {}) - if frame is None: - return { - "error": f"Unable to get current frame for camera '{camera}'", - } - - _, img_encoded = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 85]) - image_base64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8") - image_data_url = f"data:image/jpeg;base64,{image_base64}" - tracked_objects_dict = {} with camera_state.current_frame_lock: tracked_objects = camera_state.tracked_objects.copy() @@ -283,7 +273,6 @@ async def _execute_get_live_context( return { "camera": camera, "timestamp": frame_time, - "image": image_data_url, "detections": list(tracked_objects_dict.values()), } @@ -294,6 +283,46 @@ async def _execute_get_live_context( } +async def _get_live_frame_image_url( + request: Request, + camera: str, + allowed_cameras: List[str], +) -> Optional[str]: + """ + Fetch the current live frame for a camera as a base64 data URL. + + Returns None if the frame cannot be retrieved. Used when include_live_image + is set to attach the image to the first user message. + """ + if ( + camera not in allowed_cameras + or camera not in request.app.frigate_config.cameras + ): + return None + try: + frame_processor = request.app.detected_frames_processor + if camera not in frame_processor.camera_states: + return None + frame = frame_processor.get_current_frame(camera, {}) + if frame is None: + return None + height, width = frame.shape[:2] + max_dimension = 1024 + if height > max_dimension or width > max_dimension: + scale = max_dimension / max(height, width) + frame = cv2.resize( + frame, + (int(width * scale), int(height * scale)), + interpolation=cv2.INTER_AREA, + ) + _, img_encoded = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 85]) + b64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8") + return f"data:image/jpeg;base64,{b64}" + except Exception as e: + logger.debug("Failed to get live frame for %s: %s", camera, e) + return None + + async def _execute_tool_internal( tool_name: str, arguments: Dict[str, Any], @@ -391,13 +420,21 @@ async def chat_completion( + "\n\nWhen users refer to cameras by their friendly name (e.g., 'Back Deck Camera'), use the corresponding camera ID (e.g., 'back_deck_cam') in tool calls." ) + live_image_note = "" + if body.include_live_image: + live_image_note = ( + f"\n\nThe first user message includes a live image from camera " + f"'{body.include_live_image}'. Use get_live_context for that camera to get " + "current detection details (objects, zones) to aid in understanding the image." + ) + system_prompt = f"""You are a helpful assistant for Frigate, a security camera NVR system. You help users answer questions about their cameras, detected objects, and events. Current date and time: {current_date_str} at {current_time_str} (UTC) When users ask questions about "today", "yesterday", "this week", etc., use the current date above as reference. When searching for objects or events, use ISO 8601 format for dates (e.g., {current_date_str}T00:00:00Z for the start of today). -Always be accurate with time calculations based on the current date provided.{cameras_section}""" +Always be accurate with time calculations based on the current date provided.{cameras_section}{live_image_note}""" conversation.append( { @@ -406,6 +443,7 @@ Always be accurate with time calculations based on the current date provided.{ca } ) + first_user_message_seen = False for msg in body.messages: msg_dict = { "role": msg.role, @@ -415,6 +453,22 @@ Always be accurate with time calculations based on the current date provided.{ca msg_dict["tool_call_id"] = msg.tool_call_id if msg.name: msg_dict["name"] = msg.name + + if ( + msg.role == "user" + and not first_user_message_seen + and body.include_live_image + ): + first_user_message_seen = True + image_url = await _get_live_frame_image_url( + request, body.include_live_image, allowed_cameras + ) + if image_url: + msg_dict["content"] = [ + {"type": "text", "text": msg.content}, + {"type": "image_url", "image_url": {"url": image_url}}, + ] + conversation.append(msg_dict) tool_iterations = 0 diff --git a/frigate/api/defs/request/chat_body.py b/frigate/api/defs/request/chat_body.py index 7b327bf5a..fa3c3860a 100644 --- a/frigate/api/defs/request/chat_body.py +++ b/frigate/api/defs/request/chat_body.py @@ -32,3 +32,10 @@ class ChatCompletionRequest(BaseModel): le=10, description="Maximum number of tool call iterations (default: 5)", ) + include_live_image: Optional[str] = Field( + default=None, + description=( + "If set, the current live frame from this camera is attached to the first " + "user message as multimodal content. Use with get_live_context for detection info." + ), + ) diff --git a/frigate/genai/llama_cpp.py b/frigate/genai/llama_cpp.py index 5523ce389..fafef74ae 100644 --- a/frigate/genai/llama_cpp.py +++ b/frigate/genai/llama_cpp.py @@ -216,7 +216,14 @@ class LlamaCppClient(GenAIClient): "finish_reason": "error", } except requests.exceptions.RequestException as e: - logger.warning("llama.cpp returned an error: %s", str(e)) + error_detail = str(e) + if hasattr(e, "response") and e.response is not None: + try: + error_body = e.response.text + error_detail = f"{str(e)} - Response: {error_body[:500]}" + except Exception: + pass + logger.warning("llama.cpp returned an error: %s", error_detail) return { "content": None, "tool_calls": None,