Implement semantic query for chat (#23206)

2026-06-21 03:41:55 +03:00 · 2026-05-15 13:32:53 -06:00 · 2026-05-15 13:32:53 -06:00 · b712e1fbd9
commit b712e1fbd9
parent c6eadfebb8
1 changed files with 254 additions and 58 deletions
--- a/frigate/api/chat.py
+++ b/frigate/api/chat.py
@ -68,62 +68,123 @@ class VLMMonitorRequest(BaseModel):
    zones: List[str] = []
-def get_tool_definitions() -> List[Dict[str, Any]]:
+def get_tool_definitions(
    semantic_search_enabled: bool = False,
 ) -> List[Dict[str, Any]]:
    """
    Get OpenAI-compatible tool definitions for Frigate.
    Returns a list of tool definitions that can be used with OpenAI-compatible
-    function calling APIs.
+    function calling APIs. When semantic search is enabled, the search_objects
    tool exposes an additional `semantic_query` parameter for descriptive
    queries (e.g. "person riding a lawn mower") and find_similar_objects is
    included.
    """
    search_objects_properties: Dict[str, Any] = {
        "camera": {
            "type": "string",
            "description": "Camera name to filter by (optional).",
        },
        "label": {
            "type": "string",
            "description": (
                "Generic object class to filter by — one of the tracked detector "
                "labels such as 'person', 'package', 'car', 'dog', 'bird'. Use "
                "this for broad queries like 'show me all cars today'. Combine "
                "with semantic_query when the user also describes appearance or "
                "behavior (e.g. label='person', semantic_query='riding a lawn "
                "mower')."
            ),
        },
        "sub_label": {
            "type": "string",
            "description": (
                "Filter by a DISCRETE NAMED entity recognized in the detection. "
                "Use this for: a known person's name ('John'), a delivery "
                "company ('Amazon', 'UPS'), a recognized animal species or "
                "breed ('blue jay', 'cardinal', 'golden retriever'), or a "
                "license plate string. When filtering by a specific name, set "
                "only sub_label and leave label unset. Do NOT use sub_label "
                "for descriptions of appearance, clothing, or actions — those "
                "belong in semantic_query."
            ),
        },
        "after": {
            "type": "string",
            "description": "Start time in ISO 8601 format (e.g., '2024-01-01T00:00:00Z').",
        },
        "before": {
            "type": "string",
            "description": "End time in ISO 8601 format (e.g., '2024-01-01T23:59:59Z').",
        },
        "zones": {
            "type": "array",
            "items": {"type": "string"},
            "description": "List of zone names to filter by.",
        },
        "limit": {
            "type": "integer",
            "description": "Maximum number of objects to return (default: 25).",
            "default": 25,
        },
    }
    if semantic_search_enabled:
        search_objects_properties["semantic_query"] = {
            "type": "string",
            "description": (
                "Optional natural-language description of a PHYSICAL "
                "CHARACTERISTIC, APPEARANCE, or ACTIVITY the user mentioned, "
                "used to semantically narrow results. Only set this when the "
                "user describes something beyond what label and sub_label can "
                "express on their own.\n"
                "USE for descriptive phrases like: 'riding a lawn mower', "
                "'wearing a red jacket', 'carrying a package', 'walking a "
                "dog', 'on a bicycle', 'holding an umbrella'.\n"
                "DO NOT USE for:\n"
                "- specific named people, pets, or delivery companies → use sub_label\n"
                "- animal species or breed names like 'blue jay', 'cardinal', "
                "'golden retriever' → use sub_label\n"
                "- license plate strings → use sub_label\n"
                "- generic object queries like 'all cars today' or 'every "
                "person' → use label alone with no semantic_query\n"
                "When set, combine with label/time/camera/zone filters as "
                "usual (e.g. label='person', semantic_query='riding a lawn "
                "mower', after='2024-05-01T00:00:00Z')."
            ),
        }
    search_objects_description = (
        "Search the historical record of detected objects in Frigate. "
        "Use this ONLY for questions about the PAST — e.g. 'did anyone come by today?', "
        "'when was the last car?', 'show me detections from yesterday'. "
        "Do NOT use this for monitoring or alerting requests about future events — "
        "use start_camera_watch instead for those. "
        "An 'object' in Frigate represents a tracked detection (e.g., a person, package, car).\n\n"
        "Choose filters based on what the user is asking for:\n"
        "- Generic class query ('show me all cars today'): set `label` only.\n"
        "- Specific NAMED entity (known person, delivery company, animal "
        "species/breed like 'blue jay' or 'golden retriever', license "
        "plate): set `sub_label` only and leave `label` unset.\n"
    )
    if semantic_search_enabled:
        search_objects_description += (
            "- Physical CHARACTERISTIC, APPEARANCE, or ACTIVITY that is not a "
            "discrete name ('person riding a lawn mower', 'someone in a red "
            "jacket', 'person carrying a package'): set `semantic_query` with "
            "the descriptive phrase, optionally alongside `label` for the "
            "object class. Do NOT put descriptive phrases in sub_label."
        )
    return [
        {
            "type": "function",
            "function": {
                "name": "search_objects",
-                "description": (
+                "description": search_objects_description,
                    "Search the historical record of detected objects in Frigate. "
                    "Use this ONLY for questions about the PAST — e.g. 'did anyone come by today?', "
                    "'when was the last car?', 'show me detections from yesterday'. "
                    "Do NOT use this for monitoring or alerting requests about future events — "
                    "use start_camera_watch instead for those. "
                    "An 'object' in Frigate represents a tracked detection (e.g., a person, package, car). "
                    "When the user asks about a specific name (person, delivery company, animal, etc.), "
                    "filter by sub_label only and do not set label."
                ),
                "parameters": {
                    "type": "object",
-                    "properties": {
+                    "properties": search_objects_properties,
                        "camera": {
                            "type": "string",
                            "description": "Camera name to filter by (optional).",
                        },
                        "label": {
                            "type": "string",
                            "description": "Object label to filter by (e.g., 'person', 'package', 'car').",
                        },
                        "sub_label": {
                            "type": "string",
                            "description": "Name of a person, delivery company, animal, etc. When filtering by a specific name, use only sub_label; do not set label.",
                        },
                        "after": {
                            "type": "string",
                            "description": "Start time in ISO 8601 format (e.g., '2024-01-01T00:00:00Z').",
                        },
                        "before": {
                            "type": "string",
                            "description": "End time in ISO 8601 format (e.g., '2024-01-01T23:59:59Z').",
                        },
                        "zones": {
                            "type": "array",
                            "items": {"type": "string"},
                            "description": "List of zone names to filter by.",
                        },
                        "limit": {
                            "type": "integer",
                            "description": "Maximum number of objects to return (default: 25).",
                            "default": 25,
                        },
                    },
                },
                "required": [],
            },
@ -397,9 +458,12 @@ def get_tool_definitions() -> List[Dict[str, Any]]:
    summary="Get available tools",
    description="Returns OpenAI-compatible tool definitions for function calling.",
 )
-def get_tools() -> JSONResponse:
+def get_tools(request: Request) -> JSONResponse:
    """Get list of available tools for LLM function calling."""
-    tools = get_tool_definitions()
+    semantic_search_enabled = bool(
        getattr(request.app.frigate_config.semantic_search, "enabled", False)
    )
    tools = get_tool_definitions(semantic_search_enabled=semantic_search_enabled)
    return JSONResponse(content={"tools": tools})
@ -432,16 +496,29 @@ def _resolve_zones(
 async def _execute_search_objects(
    request: Request,
    arguments: Dict[str, Any],
    allowed_cameras: List[str],
    config: FrigateConfig,
 ) -> JSONResponse:
    """
    Execute the search_objects tool.
-    This searches for detected objects (events) in Frigate using the same
+    Routes to the semantic path when the LLM supplied a `semantic_query`
-    logic as the events API endpoint.
+    and semantic search is enabled; otherwise delegates to the standard
    events API logic.
    """
    config = request.app.frigate_config
    semantic_query = arguments.get("semantic_query")
    if isinstance(semantic_query, str):
        semantic_query = semantic_query.strip() or None
    else:
        semantic_query = None
    if semantic_query and getattr(config.semantic_search, "enabled", False):
        return await _execute_search_objects_semantic(
            request, arguments, allowed_cameras, semantic_query
        )
    # Parse after/before as server local time; convert to Unix timestamp
    after = arguments.get("after")
    before = arguments.get("before")
@ -508,6 +585,119 @@ async def _execute_search_objects(
        )
 async def _execute_search_objects_semantic(
    request: Request,
    arguments: Dict[str, Any],
    allowed_cameras: List[str],
    semantic_query: str,
 ) -> JSONResponse:
    """Search objects via fused thumbnail + description embeddings.
    Runs both visual and description vec searches against `semantic_query`,
    intersects the candidates with the structured filters (camera, label,
    sub_label, zones, time window) the LLM supplied, and ranks the survivors
    by fused similarity. Mirrors the candidate-then-filter pattern used by
    find_similar_objects since sqlite-vec's IN filter is unreliable.
    """
    from peewee import fn
    config = request.app.frigate_config
    context = request.app.embeddings
    if context is None:
        logger.warning(
            "semantic_query supplied but embeddings context is unavailable; "
            "returning empty results."
        )
        return JSONResponse(content=[])
    after = parse_iso_to_timestamp(arguments.get("after"))
    before = parse_iso_to_timestamp(arguments.get("before"))
    camera_arg = arguments.get("camera")
    if camera_arg and camera_arg != "all":
        if camera_arg not in allowed_cameras:
            return JSONResponse(content=[])
        cameras = [camera_arg]
    else:
        cameras = list(allowed_cameras) if allowed_cameras else []
    if not cameras:
        return JSONResponse(content=[])
    label = arguments.get("label")
    sub_label = arguments.get("sub_label")
    zones = arguments.get("zones")
    if isinstance(zones, list) and zones:
        zones = _resolve_zones(zones, config, cameras)
    else:
        zones = None
    limit = int(arguments.get("limit", 25))
    limit = max(1, min(limit, 100))
    visual_distances: Dict[str, float] = {}
    description_distances: Dict[str, float] = {}
    try:
        rows = context.search_thumbnail(semantic_query)
        visual_distances = {row[0]: row[1] for row in rows}
    except Exception:
        logger.exception(
            "search_thumbnail failed for semantic_query: %s", semantic_query
        )
    try:
        rows = context.search_description(semantic_query)
        description_distances = {row[0]: row[1] for row in rows}
    except Exception:
        logger.exception(
            "search_description failed for semantic_query: %s", semantic_query
        )
    vec_ids = set(visual_distances) | set(description_distances)
    if not vec_ids:
        return JSONResponse(content=[])
    clauses = [Event.id.in_(list(vec_ids)), Event.camera.in_(cameras)]
    if after is not None:
        clauses.append(Event.start_time >= after)
    if before is not None:
        clauses.append(Event.start_time <= before)
    if label:
        clauses.append(Event.label == label)
    if sub_label:
        # case-insensitive match to mirror events() behavior
        clauses.append(fn.LOWER(Event.sub_label.cast("text")) == sub_label.lower())
    if zones:
        zone_clauses = [Event.zones.cast("text") % f'*"{zone}"*' for zone in zones]
        clauses.append(reduce(operator.or_, zone_clauses))
    eligible = {e.id: e for e in Event.select().where(reduce(operator.and_, clauses))}
    scored: List[tuple[str, float]] = []
    for eid in eligible:
        v_score = (
            distance_to_score(visual_distances[eid], context.thumb_stats)
            if eid in visual_distances
            else None
        )
        d_score = (
            distance_to_score(description_distances[eid], context.desc_stats)
            if eid in description_distances
            else None
        )
        fused = fuse_scores(v_score, d_score)
        if fused is None:
            continue
        scored.append((eid, fused))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    scored = scored[:limit]
    results = [hydrate_event(eligible[eid], score=score) for eid, score in scored]
    return JSONResponse(content=results)
 async def _execute_find_similar_objects(
    request: Request,
    arguments: Dict[str, Any],
@ -696,9 +886,7 @@ async def execute_tool(
    logger.debug(f"Executing tool: {tool_name} with arguments: {arguments}")
    if tool_name == "search_objects":
-        return await _execute_search_objects(
+        return await _execute_search_objects(request, arguments, allowed_cameras)
            arguments, allowed_cameras, request.app.frigate_config
        )
    if tool_name == "find_similar_objects":
        result = await _execute_find_similar_objects(
@ -878,9 +1066,7 @@ async def _execute_tool_internal(
    This is used by the chat completion endpoint to execute tools.
    """
    if tool_name == "search_objects":
-        response = await _execute_search_objects(
+        response = await _execute_search_objects(request, arguments, allowed_cameras)
            arguments, allowed_cameras, request.app.frigate_config
        )
        try:
            if hasattr(response, "body"):
                body_str = response.body.decode("utf-8")
@ -1293,7 +1479,9 @@ async def chat_completion(
            status_code=400,
        )
-    tools = get_tool_definitions()
+    config = request.app.frigate_config
    semantic_search_enabled = bool(getattr(config.semantic_search, "enabled", False))
    tools = get_tool_definitions(semantic_search_enabled=semantic_search_enabled)
    conversation = []
    current_datetime = datetime.now()
@ -1301,7 +1489,6 @@ async def chat_completion(
    current_time_str = current_datetime.strftime("%I:%M:%S %p")
    cameras_info = []
    config = request.app.frigate_config
    has_speed_zone = False
    for camera_id in allowed_cameras:
        if camera_id not in config.cameras:
@ -1339,6 +1526,15 @@ async def chat_completion(
        )
        speed_units_section = f"\n\nReport object speeds to the user in {speed_unit}."
    semantic_search_section = ""
    if semantic_search_enabled:
        semantic_search_section = (
            "\n\nWhen routing a search_objects call, pick filters by the shape of the user's request:\n"
            "- Generic class ('show me all cars today'): set `label` only.\n"
            "- Specific named entity — a known person ('John'), delivery company ('Amazon'), animal species/breed ('blue jay', 'cardinal', 'golden retriever'), or license plate: set `sub_label` only and leave `label` unset.\n"
            "- Physical characteristic, appearance, or activity that is NOT a discrete name ('find me people riding a lawn mower', 'someone in a red jacket', 'a person carrying a package'): set `semantic_query` with the descriptive phrase, optionally combined with `label` for the object class. Never put descriptive phrases in `sub_label`."
        )
    system_prompt = f"""You are a helpful assistant for Frigate, a security camera NVR system. You help users answer questions about their cameras, detected objects, and events.
 Current server local date and time: {current_date_str} at {current_time_str}
@ -1350,7 +1546,7 @@ When users ask about "today", "yesterday", "this week", etc., use the current da
 When searching for objects or events, use ISO 8601 format for dates (e.g., {current_date_str}T00:00:00Z for the start of today).
 Always be accurate with time calculations based on the current date provided.
-When a user refers to a specific object they have seen or describe with identifying details ("that green car", "the person in the red jacket", "a package left today"), prefer the find_similar_objects tool over search_objects. Use search_objects first only to locate the anchor event, then pass its id to find_similar_objects. For generic queries like "show me all cars today", keep using search_objects. If a user message begins with [attached_event:<id>], treat that event id as the anchor for any similarity or "tell me more" request in the same message and call find_similar_objects with that id.{cameras_section}{speed_units_section}"""
+When a user refers to a specific object they have seen or describe with identifying details ("that green car", "the person in the red jacket", "a package left today"), prefer the find_similar_objects tool over search_objects. Use search_objects first only to locate the anchor event, then pass its id to find_similar_objects. For generic queries like "show me all cars today", keep using search_objects. If a user message begins with [attached_event:<id>], treat that event id as the anchor for any similarity or "tell me more" request in the same message and call find_similar_objects with that id.{semantic_search_section}{cameras_section}{speed_units_section}"""
    conversation.append(
        {