From 5fdb56a10632b61dd94e087425753c0305369a9f Mon Sep 17 00:00:00 2001
From: Nicolas Mowen
Date: Thu, 22 Jan 2026 12:04:40 -0700
Subject: [PATCH] Add live context tool to LLM (#21754)

* Add live context tool

* Improve handling of images in request

* Improve prompt caching
---
 frigate/api/chat.py                   | 172 +++++++++++++++++++++++++-
 frigate/api/defs/request/chat_body.py |   7 ++
 frigate/genai/__init__.py             |   5 +-
 frigate/genai/llama_cpp.py            |   9 +-
 4 files changed, 188 insertions(+), 5 deletions(-)

diff --git a/frigate/api/chat.py b/frigate/api/chat.py
index eeff3ab6d..444650e13 100644
--- a/frigate/api/chat.py
+++ b/frigate/api/chat.py
@@ -1,10 +1,12 @@
 """Chat and LLM tool calling APIs."""
 
+import base64
 import json
 import logging
 from datetime import datetime, timezone
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
+import cv2
 from fastapi import APIRouter, Body, Depends, Request
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
@@ -87,6 +89,28 @@ def get_tool_definitions() -> List[Dict[str, Any]]:
                 "required": [],
             },
         },
+        {
+            "type": "function",
+            "function": {
+                "name": "get_live_context",
+                "description": (
+                    "Get the current detection information for a camera: objects being tracked, "
+                    "zones, timestamps. Use this to understand what is visible in the live view. "
+                    "Call this when the user has included a live image (via include_live_image) or "
+                    "when answering questions about what is happening right now on a specific camera."
+                ),
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "camera": {
+                            "type": "string",
+                            "description": "Camera name to get live context for.",
+                        },
+                    },
+                    "required": ["camera"],
+                },
+            },
+        },
     ]
 
 
@@ -207,6 +231,98 @@ async def execute_tool(
     )
 
 
+async def _execute_get_live_context(
+    request: Request,
+    camera: str,
+    allowed_cameras: List[str],
+) -> Dict[str, Any]:
+    if camera not in allowed_cameras:
+        return {
+            "error": f"Camera '{camera}' not found or access denied",
+        }
+
+    if camera not in request.app.frigate_config.cameras:
+        return {
+            "error": f"Camera '{camera}' not found",
+        }
+
+    try:
+        frame_processor = request.app.detected_frames_processor
+        camera_state = frame_processor.camera_states.get(camera)
+
+        if camera_state is None:
+            return {
+                "error": f"Camera '{camera}' state not available",
+            }
+
+        tracked_objects_dict = {}
+        with camera_state.current_frame_lock:
+            tracked_objects = camera_state.tracked_objects.copy()
+            frame_time = camera_state.current_frame_time
+
+        for obj_id, tracked_obj in tracked_objects.items():
+            obj_dict = tracked_obj.to_dict()
+            if obj_dict.get("frame_time") == frame_time:
+                tracked_objects_dict[obj_id] = {
+                    "label": obj_dict.get("label"),
+                    "zones": obj_dict.get("current_zones", []),
+                    "sub_label": obj_dict.get("sub_label"),
+                    "stationary": obj_dict.get("stationary", False),
+                }
+
+        return {
+            "camera": camera,
+            "timestamp": frame_time,
+            "detections": list(tracked_objects_dict.values()),
+        }
+
+    except Exception as e:
+        logger.error(f"Error executing get_live_context: {e}", exc_info=True)
+        return {
+            "error": f"Error getting live context: {str(e)}",
+        }
+
+
+async def _get_live_frame_image_url(
+    request: Request,
+    camera: str,
+    allowed_cameras: List[str],
+) -> Optional[str]:
+    """
+    Fetch the current live frame for a camera as a base64 data URL.
+
+    Returns None if the frame cannot be retrieved. Used when include_live_image
+    is set to attach the image to the first user message.
+    """
+    if (
+        camera not in allowed_cameras
+        or camera not in request.app.frigate_config.cameras
+    ):
+        return None
+    try:
+        frame_processor = request.app.detected_frames_processor
+        if camera not in frame_processor.camera_states:
+            return None
+        frame = frame_processor.get_current_frame(camera, {})
+        if frame is None:
+            return None
+        height, width = frame.shape[:2]
+        max_dimension = 1024
+        if height > max_dimension or width > max_dimension:
+            scale = max_dimension / max(height, width)
+            frame = cv2.resize(
+                frame,
+                (int(width * scale), int(height * scale)),
+                interpolation=cv2.INTER_AREA,
+            )
+        _, img_encoded = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
+        b64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8")
+        return f"data:image/jpeg;base64,{b64}"
+    except Exception as e:
+        logger.debug("Failed to get live frame for %s: %s", camera, e)
+        return None
+
+
 async def _execute_tool_internal(
     tool_name: str,
     arguments: Dict[str, Any],
@@ -231,6 +347,11 @@ async def _execute_tool_internal(
         except (json.JSONDecodeError, AttributeError) as e:
             logger.warning(f"Failed to extract tool result: {e}")
             return {"error": "Failed to parse tool result"}
+    elif tool_name == "get_live_context":
+        camera = arguments.get("camera")
+        if not camera:
+            return {"error": "Camera parameter is required"}
+        return await _execute_get_live_context(request, camera, allowed_cameras)
     else:
         return {"error": f"Unknown tool: {tool_name}"}
 
@@ -277,13 +398,43 @@ async def chat_completion(
     current_datetime = datetime.now(timezone.utc)
     current_date_str = current_datetime.strftime("%Y-%m-%d")
     current_time_str = current_datetime.strftime("%H:%M:%S %Z")
+
+    cameras_info = []
+    config = request.app.frigate_config
+    for camera_id in allowed_cameras:
+        if camera_id not in config.cameras:
+            continue
+        camera_config = config.cameras[camera_id]
+        friendly_name = (
+            camera_config.friendly_name
+            if camera_config.friendly_name
+            else camera_id.replace("_", " ").title()
+        )
+        cameras_info.append(f" - {friendly_name} (ID: {camera_id})")
+
+    cameras_section = ""
+    if cameras_info:
+        cameras_section = (
+            "\n\nAvailable cameras:\n"
+            + "\n".join(cameras_info)
+            + "\n\nWhen users refer to cameras by their friendly name (e.g., 'Back Deck Camera'), use the corresponding camera ID (e.g., 'back_deck_cam') in tool calls."
+        )
+
+    live_image_note = ""
+    if body.include_live_image:
+        live_image_note = (
+            f"\n\nThe first user message includes a live image from camera "
+            f"'{body.include_live_image}'. Use get_live_context for that camera to get "
+            "current detection details (objects, zones) to aid in understanding the image."
+        )
+
     system_prompt = f"""You are a helpful assistant for Frigate, a security camera NVR system. You help users answer questions about their cameras, detected objects, and events.
 
 Current date and time: {current_date_str} at {current_time_str} (UTC)
 
 When users ask questions about "today", "yesterday", "this week", etc., use the current date above as reference.
 When searching for objects or events, use ISO 8601 format for dates (e.g., {current_date_str}T00:00:00Z for the start of today).
-Always be accurate with time calculations based on the current date provided."""
+Always be accurate with time calculations based on the current date provided.{cameras_section}{live_image_note}"""
 
     conversation.append(
         {
@@ -292,6 +443,7 @@ Always be accurate with time calculations based on the current date provided."""
         }
     )
 
+    first_user_message_seen = False
     for msg in body.messages:
         msg_dict = {
             "role": msg.role,
@@ -301,6 +453,22 @@ Always be accurate with time calculations based on the current date provided."""
             msg_dict["tool_call_id"] = msg.tool_call_id
         if msg.name:
             msg_dict["name"] = msg.name
+
+        if (
+            msg.role == "user"
+            and not first_user_message_seen
+            and body.include_live_image
+        ):
+            first_user_message_seen = True
+            image_url = await _get_live_frame_image_url(
+                request, body.include_live_image, allowed_cameras
+            )
+            if image_url:
+                msg_dict["content"] = [
+                    {"type": "text", "text": msg.content},
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                ]
+
         conversation.append(msg_dict)
 
     tool_iterations = 0
diff --git a/frigate/api/defs/request/chat_body.py b/frigate/api/defs/request/chat_body.py
index 7b327bf5a..fa3c3860a 100644
--- a/frigate/api/defs/request/chat_body.py
+++ b/frigate/api/defs/request/chat_body.py
@@ -32,3 +32,10 @@ class ChatCompletionRequest(BaseModel):
         le=10,
         description="Maximum number of tool call iterations (default: 5)",
     )
+    include_live_image: Optional[str] = Field(
+        default=None,
+        description=(
+            "If set, the current live frame from this camera is attached to the first "
+            "user message as multimodal content. Use with get_live_context for detection info."
+        ),
+    )
diff --git a/frigate/genai/__init__.py b/frigate/genai/__init__.py
index 4be75f418..fa418f101 100644
--- a/frigate/genai/__init__.py
+++ b/frigate/genai/__init__.py
@@ -69,7 +69,7 @@ class GenAIClient:
                 return "\n- (No objects detected)"
 
         context_prompt = f"""
-Your task is to analyze the sequence of images ({len(thumbnails)} total) taken in chronological order from the perspective of the {review_data["camera"]} security camera.
+Your task is to analyze a sequence of images taken in chronological order from a security camera.
 
 ## Normal Activity Patterns for This Property
 
@@ -107,7 +107,8 @@ Your response MUST be a flat JSON object with:
 
 ## Sequence Details
 
-- Frame 1 = earliest, Frame {len(thumbnails)} = latest
+- Camera: {review_data["camera"]}
+- Total frames: {len(thumbnails)} (Frame 1 = earliest, Frame {len(thumbnails)} = latest)
 - Activity started at {review_data["start"]} and lasted {review_data["duration"]} seconds
 - Zones involved: {", ".join(review_data["zones"]) if review_data["zones"] else "None"}
 
diff --git a/frigate/genai/llama_cpp.py b/frigate/genai/llama_cpp.py
index 5523ce389..fafef74ae 100644
--- a/frigate/genai/llama_cpp.py
+++ b/frigate/genai/llama_cpp.py
@@ -216,7 +216,14 @@ class LlamaCppClient(GenAIClient):
                 "finish_reason": "error",
             }
         except requests.exceptions.RequestException as e:
-            logger.warning("llama.cpp returned an error: %s", str(e))
+            error_detail = str(e)
+            if hasattr(e, "response") and e.response is not None:
+                try:
+                    error_body = e.response.text
+                    error_detail = f"{str(e)} - Response: {error_body[:500]}"
+                except Exception:
+                    pass
+            logger.warning("llama.cpp returned an error: %s", error_detail)
             return {
                 "content": None,
                 "tool_calls": None,