Improve handling of live camera images in chat completion requests

This commit is contained in:
Nicolas Mowen 2026-01-22 08:59:30 -07:00
parent 6569f27c26
commit 7e14b8a42e
3 changed files with 86 additions and 18 deletions

View File

@ -4,7 +4,7 @@ import base64
import json import json
import logging import logging
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Any, Dict, List from typing import Any, Dict, List, Optional
import cv2 import cv2
from fastapi import APIRouter, Body, Depends, Request from fastapi import APIRouter, Body, Depends, Request
@ -94,10 +94,10 @@ def get_tool_definitions() -> List[Dict[str, Any]]:
"function": { "function": {
"name": "get_live_context", "name": "get_live_context",
"description": ( "description": (
"Get the current live view and detection information for a camera. " "Get the current detection information for a camera: objects being tracked, "
"Returns the current camera frame as a base64-encoded image along with " "zones, timestamps. Use this to understand what is visible in the live view. "
"information about objects currently being tracked/detected on the camera. " "Call this when the user has included a live image (via include_live_image) or "
"Use this to answer questions about what is happening right now on a specific camera." "when answering questions about what is happening right now on a specific camera."
), ),
"parameters": { "parameters": {
"type": "object", "type": "object",
@ -255,16 +255,6 @@ async def _execute_get_live_context(
"error": f"Camera '{camera}' state not available", "error": f"Camera '{camera}' state not available",
} }
frame = frame_processor.get_current_frame(camera, {})
if frame is None:
return {
"error": f"Unable to get current frame for camera '{camera}'",
}
_, img_encoded = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
image_base64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8")
image_data_url = f"data:image/jpeg;base64,{image_base64}"
tracked_objects_dict = {} tracked_objects_dict = {}
with camera_state.current_frame_lock: with camera_state.current_frame_lock:
tracked_objects = camera_state.tracked_objects.copy() tracked_objects = camera_state.tracked_objects.copy()
@ -283,7 +273,6 @@ async def _execute_get_live_context(
return { return {
"camera": camera, "camera": camera,
"timestamp": frame_time, "timestamp": frame_time,
"image": image_data_url,
"detections": list(tracked_objects_dict.values()), "detections": list(tracked_objects_dict.values()),
} }
@ -294,6 +283,46 @@ async def _execute_get_live_context(
} }
async def _get_live_frame_image_url(
request: Request,
camera: str,
allowed_cameras: List[str],
) -> Optional[str]:
"""
Fetch the current live frame for a camera as a base64 data URL.
Returns None if the frame cannot be retrieved. Used when include_live_image
is set to attach the image to the first user message.
"""
if (
camera not in allowed_cameras
or camera not in request.app.frigate_config.cameras
):
return None
try:
frame_processor = request.app.detected_frames_processor
if camera not in frame_processor.camera_states:
return None
frame = frame_processor.get_current_frame(camera, {})
if frame is None:
return None
height, width = frame.shape[:2]
max_dimension = 1024
if height > max_dimension or width > max_dimension:
scale = max_dimension / max(height, width)
frame = cv2.resize(
frame,
(int(width * scale), int(height * scale)),
interpolation=cv2.INTER_AREA,
)
_, img_encoded = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
b64 = base64.b64encode(img_encoded.tobytes()).decode("utf-8")
return f"data:image/jpeg;base64,{b64}"
except Exception as e:
logger.debug("Failed to get live frame for %s: %s", camera, e)
return None
async def _execute_tool_internal( async def _execute_tool_internal(
tool_name: str, tool_name: str,
arguments: Dict[str, Any], arguments: Dict[str, Any],
@ -391,13 +420,21 @@ async def chat_completion(
+ "\n\nWhen users refer to cameras by their friendly name (e.g., 'Back Deck Camera'), use the corresponding camera ID (e.g., 'back_deck_cam') in tool calls." + "\n\nWhen users refer to cameras by their friendly name (e.g., 'Back Deck Camera'), use the corresponding camera ID (e.g., 'back_deck_cam') in tool calls."
) )
live_image_note = ""
if body.include_live_image:
live_image_note = (
f"\n\nThe first user message includes a live image from camera "
f"'{body.include_live_image}'. Use get_live_context for that camera to get "
"current detection details (objects, zones) to aid in understanding the image."
)
system_prompt = f"""You are a helpful assistant for Frigate, a security camera NVR system. You help users answer questions about their cameras, detected objects, and events. system_prompt = f"""You are a helpful assistant for Frigate, a security camera NVR system. You help users answer questions about their cameras, detected objects, and events.
Current date and time: {current_date_str} at {current_time_str} (UTC) Current date and time: {current_date_str} at {current_time_str} (UTC)
When users ask questions about "today", "yesterday", "this week", etc., use the current date above as reference. When users ask questions about "today", "yesterday", "this week", etc., use the current date above as reference.
When searching for objects or events, use ISO 8601 format for dates (e.g., {current_date_str}T00:00:00Z for the start of today). When searching for objects or events, use ISO 8601 format for dates (e.g., {current_date_str}T00:00:00Z for the start of today).
Always be accurate with time calculations based on the current date provided.{cameras_section}""" Always be accurate with time calculations based on the current date provided.{cameras_section}{live_image_note}"""
conversation.append( conversation.append(
{ {
@ -406,6 +443,7 @@ Always be accurate with time calculations based on the current date provided.{ca
} }
) )
first_user_message_seen = False
for msg in body.messages: for msg in body.messages:
msg_dict = { msg_dict = {
"role": msg.role, "role": msg.role,
@ -415,6 +453,22 @@ Always be accurate with time calculations based on the current date provided.{ca
msg_dict["tool_call_id"] = msg.tool_call_id msg_dict["tool_call_id"] = msg.tool_call_id
if msg.name: if msg.name:
msg_dict["name"] = msg.name msg_dict["name"] = msg.name
if (
msg.role == "user"
and not first_user_message_seen
and body.include_live_image
):
first_user_message_seen = True
image_url = await _get_live_frame_image_url(
request, body.include_live_image, allowed_cameras
)
if image_url:
msg_dict["content"] = [
{"type": "text", "text": msg.content},
{"type": "image_url", "image_url": {"url": image_url}},
]
conversation.append(msg_dict) conversation.append(msg_dict)
tool_iterations = 0 tool_iterations = 0

View File

@ -32,3 +32,10 @@ class ChatCompletionRequest(BaseModel):
le=10, le=10,
description="Maximum number of tool call iterations (default: 5)", description="Maximum number of tool call iterations (default: 5)",
) )
include_live_image: Optional[str] = Field(
default=None,
description=(
"If set, the current live frame from this camera is attached to the first "
"user message as multimodal content. Use with get_live_context for detection info."
),
)

View File

@ -216,7 +216,14 @@ class LlamaCppClient(GenAIClient):
"finish_reason": "error", "finish_reason": "error",
} }
except requests.exceptions.RequestException as e: except requests.exceptions.RequestException as e:
logger.warning("llama.cpp returned an error: %s", str(e)) error_detail = str(e)
if hasattr(e, "response") and e.response is not None:
try:
error_body = e.response.text
error_detail = f"{str(e)} - Response: {error_body[:500]}"
except Exception:
pass
logger.warning("llama.cpp returned an error: %s", error_detail)
return { return {
"content": None, "content": None,
"tool_calls": None, "tool_calls": None,