mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-03-26 10:08:22 +03:00
Dynamic vision chat (#22551)
* Improve live context image handling * Improve chat handling * Cleanup
This commit is contained in:
parent
34a06ac77b
commit
373bcadef8
@ -201,10 +201,9 @@ def get_tool_definitions() -> List[Dict[str, Any]]:
|
|||||||
"function": {
|
"function": {
|
||||||
"name": "get_live_context",
|
"name": "get_live_context",
|
||||||
"description": (
|
"description": (
|
||||||
"Get the current detection information for a camera: objects being tracked, "
|
"Get the current live image and detection information for a camera: objects being tracked, "
|
||||||
"zones, timestamps. Use this to understand what is visible in the live view. "
|
"zones, timestamps. Use this to understand what is visible in the live view. "
|
||||||
"Call this when the user has included a live image (via include_live_image) or "
|
"Call this when answering questions about what is happening right now on a specific camera."
|
||||||
"when answering questions about what is happening right now on a specific camera."
|
|
||||||
),
|
),
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
@ -384,12 +383,54 @@ async def _execute_get_live_context(
|
|||||||
"stationary": obj_dict.get("stationary", False),
|
"stationary": obj_dict.get("stationary", False),
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
result: Dict[str, Any] = {
|
||||||
"camera": camera,
|
"camera": camera,
|
||||||
"timestamp": frame_time,
|
"timestamp": frame_time,
|
||||||
"detections": list(tracked_objects_dict.values()),
|
"detections": list(tracked_objects_dict.values()),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Grab live frame and handle based on provider configuration
|
||||||
|
image_url = await _get_live_frame_image_url(request, camera, allowed_cameras)
|
||||||
|
if image_url:
|
||||||
|
genai_manager = request.app.genai_manager
|
||||||
|
if genai_manager.tool_client is genai_manager.vision_client:
|
||||||
|
# Same provider handles both roles — pass image URL so it can
|
||||||
|
# be injected as a user message (images can't be in tool results)
|
||||||
|
result["_image_url"] = image_url
|
||||||
|
elif genai_manager.vision_client is not None:
|
||||||
|
# Separate vision provider — have it describe the image,
|
||||||
|
# providing detection context so it knows what to focus on
|
||||||
|
frame_bytes = _decode_data_url(image_url)
|
||||||
|
if frame_bytes:
|
||||||
|
detections = result.get("detections", [])
|
||||||
|
if detections:
|
||||||
|
detection_lines = []
|
||||||
|
for d in detections:
|
||||||
|
parts = [d.get("label", "unknown")]
|
||||||
|
if d.get("sub_label"):
|
||||||
|
parts.append(f"({d['sub_label']})")
|
||||||
|
if d.get("zones"):
|
||||||
|
parts.append(f"in {', '.join(d['zones'])}")
|
||||||
|
detection_lines.append(" ".join(parts))
|
||||||
|
context = (
|
||||||
|
"The following objects are currently being tracked: "
|
||||||
|
+ "; ".join(detection_lines)
|
||||||
|
+ "."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
context = "No objects are currently being tracked."
|
||||||
|
|
||||||
|
description = genai_manager.vision_client._send(
|
||||||
|
f"Describe what you see in this security camera image. "
|
||||||
|
f"{context} Focus on the scene, any visible activity, "
|
||||||
|
f"and details about the tracked objects.",
|
||||||
|
[frame_bytes],
|
||||||
|
)
|
||||||
|
if description:
|
||||||
|
result["image_description"] = description
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error executing get_live_context: {e}", exc_info=True)
|
logger.error(f"Error executing get_live_context: {e}", exc_info=True)
|
||||||
return {
|
return {
|
||||||
@ -405,8 +446,8 @@ async def _get_live_frame_image_url(
|
|||||||
"""
|
"""
|
||||||
Fetch the current live frame for a camera as a base64 data URL.
|
Fetch the current live frame for a camera as a base64 data URL.
|
||||||
|
|
||||||
Returns None if the frame cannot be retrieved. Used when include_live_image
|
Returns None if the frame cannot be retrieved. Used by get_live_context
|
||||||
is set to attach the image to the first user message.
|
to attach the live image to the conversation.
|
||||||
"""
|
"""
|
||||||
if (
|
if (
|
||||||
camera not in allowed_cameras
|
camera not in allowed_cameras
|
||||||
@ -421,12 +462,12 @@ async def _get_live_frame_image_url(
|
|||||||
if frame is None:
|
if frame is None:
|
||||||
return None
|
return None
|
||||||
height, width = frame.shape[:2]
|
height, width = frame.shape[:2]
|
||||||
max_dimension = 1024
|
target_height = 480
|
||||||
if height > max_dimension or width > max_dimension:
|
if height > target_height:
|
||||||
scale = max_dimension / max(height, width)
|
scale = target_height / height
|
||||||
frame = cv2.resize(
|
frame = cv2.resize(
|
||||||
frame,
|
frame,
|
||||||
(int(width * scale), int(height * scale)),
|
(int(width * scale), target_height),
|
||||||
interpolation=cv2.INTER_AREA,
|
interpolation=cv2.INTER_AREA,
|
||||||
)
|
)
|
||||||
_, img_encoded = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
|
_, img_encoded = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
|
||||||
@ -437,6 +478,17 @@ async def _get_live_frame_image_url(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_data_url(data_url: str) -> Optional[bytes]:
|
||||||
|
"""Decode a base64 data URL to raw bytes."""
|
||||||
|
try:
|
||||||
|
# Format: data:image/jpeg;base64,<data>
|
||||||
|
_, encoded = data_url.split(",", 1)
|
||||||
|
return base64.b64decode(encoded)
|
||||||
|
except (ValueError, Exception) as e:
|
||||||
|
logger.debug("Failed to decode data URL: %s", e)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
async def _execute_set_camera_state(
|
async def _execute_set_camera_state(
|
||||||
request: Request,
|
request: Request,
|
||||||
arguments: Dict[str, Any],
|
arguments: Dict[str, Any],
|
||||||
@ -527,12 +579,18 @@ async def _execute_pending_tools(
|
|||||||
pending_tool_calls: List[Dict[str, Any]],
|
pending_tool_calls: List[Dict[str, Any]],
|
||||||
request: Request,
|
request: Request,
|
||||||
allowed_cameras: List[str],
|
allowed_cameras: List[str],
|
||||||
) -> tuple[List[ToolCall], List[Dict[str, Any]]]:
|
) -> tuple[List[ToolCall], List[Dict[str, Any]], List[Dict[str, Any]]]:
|
||||||
"""
|
"""
|
||||||
Execute a list of tool calls; return (ToolCall list for API response, tool result dicts for conversation).
|
Execute a list of tool calls.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(ToolCall list for API response,
|
||||||
|
tool result dicts for conversation,
|
||||||
|
extra messages to inject after tool results — e.g. user messages with images)
|
||||||
"""
|
"""
|
||||||
tool_calls_out: List[ToolCall] = []
|
tool_calls_out: List[ToolCall] = []
|
||||||
tool_results: List[Dict[str, Any]] = []
|
tool_results: List[Dict[str, Any]] = []
|
||||||
|
extra_messages: List[Dict[str, Any]] = []
|
||||||
for tool_call in pending_tool_calls:
|
for tool_call in pending_tool_calls:
|
||||||
tool_name = tool_call["name"]
|
tool_name = tool_call["name"]
|
||||||
tool_args = tool_call.get("arguments") or {}
|
tool_args = tool_call.get("arguments") or {}
|
||||||
@ -569,6 +627,27 @@ async def _execute_pending_tools(
|
|||||||
for evt in tool_result
|
for evt in tool_result
|
||||||
if isinstance(evt, dict)
|
if isinstance(evt, dict)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Extract _image_url from get_live_context results — images can
|
||||||
|
# only be sent in user messages, not tool results
|
||||||
|
if isinstance(tool_result, dict) and "_image_url" in tool_result:
|
||||||
|
image_url = tool_result.pop("_image_url")
|
||||||
|
extra_messages.append(
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": f"Here is the current live image from camera '{tool_result.get('camera', 'unknown')}'.",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {"url": image_url},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
result_content = (
|
result_content = (
|
||||||
json.dumps(tool_result)
|
json.dumps(tool_result)
|
||||||
if isinstance(tool_result, (dict, list))
|
if isinstance(tool_result, (dict, list))
|
||||||
@ -604,7 +683,7 @@ async def _execute_pending_tools(
|
|||||||
"content": error_content,
|
"content": error_content,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
return (tool_calls_out, tool_results)
|
return (tool_calls_out, tool_results, extra_messages)
|
||||||
|
|
||||||
|
|
||||||
@router.post(
|
@router.post(
|
||||||
@ -660,7 +739,13 @@ async def chat_completion(
|
|||||||
if camera_config.friendly_name
|
if camera_config.friendly_name
|
||||||
else camera_id.replace("_", " ").title()
|
else camera_id.replace("_", " ").title()
|
||||||
)
|
)
|
||||||
cameras_info.append(f" - {friendly_name} (ID: {camera_id})")
|
zone_names = list(camera_config.zones.keys())
|
||||||
|
if zone_names:
|
||||||
|
cameras_info.append(
|
||||||
|
f" - {friendly_name} (ID: {camera_id}, zones: {', '.join(zone_names)})"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
cameras_info.append(f" - {friendly_name} (ID: {camera_id})")
|
||||||
|
|
||||||
cameras_section = ""
|
cameras_section = ""
|
||||||
if cameras_info:
|
if cameras_info:
|
||||||
@ -670,14 +755,6 @@ async def chat_completion(
|
|||||||
+ "\n\nWhen users refer to cameras by their friendly name (e.g., 'Back Deck Camera'), use the corresponding camera ID (e.g., 'back_deck_cam') in tool calls."
|
+ "\n\nWhen users refer to cameras by their friendly name (e.g., 'Back Deck Camera'), use the corresponding camera ID (e.g., 'back_deck_cam') in tool calls."
|
||||||
)
|
)
|
||||||
|
|
||||||
live_image_note = ""
|
|
||||||
if body.include_live_image:
|
|
||||||
live_image_note = (
|
|
||||||
f"\n\nThe first user message includes a live image from camera "
|
|
||||||
f"'{body.include_live_image}'. Use get_live_context for that camera to get "
|
|
||||||
"current detection details (objects, zones) to aid in understanding the image."
|
|
||||||
)
|
|
||||||
|
|
||||||
system_prompt = f"""You are a helpful assistant for Frigate, a security camera NVR system. You help users answer questions about their cameras, detected objects, and events.
|
system_prompt = f"""You are a helpful assistant for Frigate, a security camera NVR system. You help users answer questions about their cameras, detected objects, and events.
|
||||||
|
|
||||||
Current server local date and time: {current_date_str} at {current_time_str}
|
Current server local date and time: {current_date_str} at {current_time_str}
|
||||||
@ -687,7 +764,7 @@ Do not start your response with phrases like "I will check...", "Let me see...",
|
|||||||
Always present times to the user in the server's local timezone. When tool results include start_time_local and end_time_local, use those exact strings when listing or describing detection times—do not convert or invent timestamps. Do not use UTC or ISO format with Z for the user-facing answer unless the tool result only provides Unix timestamps without local time fields.
|
Always present times to the user in the server's local timezone. When tool results include start_time_local and end_time_local, use those exact strings when listing or describing detection times—do not convert or invent timestamps. Do not use UTC or ISO format with Z for the user-facing answer unless the tool result only provides Unix timestamps without local time fields.
|
||||||
When users ask about "today", "yesterday", "this week", etc., use the current date above as reference.
|
When users ask about "today", "yesterday", "this week", etc., use the current date above as reference.
|
||||||
When searching for objects or events, use ISO 8601 format for dates (e.g., {current_date_str}T00:00:00Z for the start of today).
|
When searching for objects or events, use ISO 8601 format for dates (e.g., {current_date_str}T00:00:00Z for the start of today).
|
||||||
Always be accurate with time calculations based on the current date provided.{cameras_section}{live_image_note}"""
|
Always be accurate with time calculations based on the current date provided.{cameras_section}"""
|
||||||
|
|
||||||
conversation.append(
|
conversation.append(
|
||||||
{
|
{
|
||||||
@ -696,7 +773,6 @@ Always be accurate with time calculations based on the current date provided.{ca
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
first_user_message_seen = False
|
|
||||||
for msg in body.messages:
|
for msg in body.messages:
|
||||||
msg_dict = {
|
msg_dict = {
|
||||||
"role": msg.role,
|
"role": msg.role,
|
||||||
@ -707,21 +783,6 @@ Always be accurate with time calculations based on the current date provided.{ca
|
|||||||
if msg.name:
|
if msg.name:
|
||||||
msg_dict["name"] = msg.name
|
msg_dict["name"] = msg.name
|
||||||
|
|
||||||
if (
|
|
||||||
msg.role == "user"
|
|
||||||
and not first_user_message_seen
|
|
||||||
and body.include_live_image
|
|
||||||
):
|
|
||||||
first_user_message_seen = True
|
|
||||||
image_url = await _get_live_frame_image_url(
|
|
||||||
request, body.include_live_image, allowed_cameras
|
|
||||||
)
|
|
||||||
if image_url:
|
|
||||||
msg_dict["content"] = [
|
|
||||||
{"type": "text", "text": msg.content},
|
|
||||||
{"type": "image_url", "image_url": {"url": image_url}},
|
|
||||||
]
|
|
||||||
|
|
||||||
conversation.append(msg_dict)
|
conversation.append(msg_dict)
|
||||||
|
|
||||||
tool_iterations = 0
|
tool_iterations = 0
|
||||||
@ -779,11 +840,16 @@ Always be accurate with time calculations based on the current date provided.{ca
|
|||||||
msg.get("content"), pending
|
msg.get("content"), pending
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
executed_calls, tool_results = await _execute_pending_tools(
|
(
|
||||||
|
executed_calls,
|
||||||
|
tool_results,
|
||||||
|
extra_msgs,
|
||||||
|
) = await _execute_pending_tools(
|
||||||
pending, request, allowed_cameras
|
pending, request, allowed_cameras
|
||||||
)
|
)
|
||||||
stream_tool_calls.extend(executed_calls)
|
stream_tool_calls.extend(executed_calls)
|
||||||
conversation.extend(tool_results)
|
conversation.extend(tool_results)
|
||||||
|
conversation.extend(extra_msgs)
|
||||||
yield (
|
yield (
|
||||||
json.dumps(
|
json.dumps(
|
||||||
{
|
{
|
||||||
@ -890,11 +956,12 @@ Always be accurate with time calculations based on the current date provided.{ca
|
|||||||
f"Tool calls detected (iteration {tool_iterations}/{max_iterations}): "
|
f"Tool calls detected (iteration {tool_iterations}/{max_iterations}): "
|
||||||
f"{len(pending_tool_calls)} tool(s) to execute"
|
f"{len(pending_tool_calls)} tool(s) to execute"
|
||||||
)
|
)
|
||||||
executed_calls, tool_results = await _execute_pending_tools(
|
executed_calls, tool_results, extra_msgs = await _execute_pending_tools(
|
||||||
pending_tool_calls, request, allowed_cameras
|
pending_tool_calls, request, allowed_cameras
|
||||||
)
|
)
|
||||||
tool_calls.extend(executed_calls)
|
tool_calls.extend(executed_calls)
|
||||||
conversation.extend(tool_results)
|
conversation.extend(tool_results)
|
||||||
|
conversation.extend(extra_msgs)
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Added {len(tool_results)} tool result(s) to conversation. "
|
f"Added {len(tool_results)} tool result(s) to conversation. "
|
||||||
f"Continuing with next LLM call..."
|
f"Continuing with next LLM call..."
|
||||||
|
|||||||
@ -32,13 +32,6 @@ class ChatCompletionRequest(BaseModel):
|
|||||||
le=10,
|
le=10,
|
||||||
description="Maximum number of tool call iterations (default: 5)",
|
description="Maximum number of tool call iterations (default: 5)",
|
||||||
)
|
)
|
||||||
include_live_image: Optional[str] = Field(
|
|
||||||
default=None,
|
|
||||||
description=(
|
|
||||||
"If set, the current live frame from this camera is attached to the first "
|
|
||||||
"user message as multimodal content. Use with get_live_context for detection info."
|
|
||||||
),
|
|
||||||
)
|
|
||||||
stream: bool = Field(
|
stream: bool = Field(
|
||||||
default=False,
|
default=False,
|
||||||
description="If true, stream the final assistant response in the body as newline-delimited JSON.",
|
description="If true, stream the final assistant response in the body as newline-delimited JSON.",
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user