GenAI Optimizations (#23006)

* Probe image-token usage in llama.cpp so we can decide more appropriately how many frames to include

* Limit frame count based on frames per second

* Handle zone case sensitivity

* Improve formatting

* Add an observations field so the model can build chain-of-thought before outputting the used fields
Nicolas Mowen · 2026-04-25 16:38:18 -06:00 · committed by GitHub
parent 1a1994ca17 · commit 0ea8924727
6 changed files with 235 additions and 16 deletions

File 1 of 6

@@ -36,6 +36,7 @@ from frigate.api.defs.response.chat_response import (
)
from frigate.api.defs.tags import Tags
from frigate.api.event import events
from frigate.config import FrigateConfig
from frigate.genai.utils import build_assistant_message_for_conversation
from frigate.jobs.vlm_watch import (
get_vlm_watch_job,
@@ -401,9 +402,38 @@ def get_tools() -> JSONResponse:
return JSONResponse(content={"tools": tools})
def _resolve_zones(
zones: List[str],
config: FrigateConfig,
target_cameras: List[str],
) -> List[str]:
"""Map zone names to their canonical config keys, case-insensitively.
LLMs frequently echo a user's casing ("Front_Yard") instead of the
configured key ("front_yard"). The downstream zone filter is a SQLite GLOB
over the JSON-encoded zones column, which is case-sensitive, so an
unnormalized name silently returns zero matches. Build a lookup over the
relevant cameras' configured zones and substitute when we find a match;
unknown names pass through so behavior matches what the model asked for.
"""
if not zones:
return zones
lookup: Dict[str, str] = {}
for camera_id in target_cameras:
camera_config = config.cameras.get(camera_id)
if camera_config is None:
continue
for zone_name in camera_config.zones.keys():
lookup.setdefault(zone_name.lower(), zone_name)
return [lookup.get(z.lower(), z) for z in zones]
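A minimal standalone sketch of the lookup behavior, with a stubbed zone list in place of FrigateConfig (the zone names here are hypothetical):

def resolve_zones_sketch(zones: list[str], configured: list[str]) -> list[str]:
    # First occurrence wins on case-insensitive collisions, mirroring setdefault().
    lookup: dict[str, str] = {}
    for name in configured:
        lookup.setdefault(name.lower(), name)
    # Known names normalize to their configured casing; unknown names pass through.
    return [lookup.get(z.lower(), z) for z in zones]

print(resolve_zones_sketch(["Front_Yard", "PORCH", "side_gate"], ["front_yard", "porch"]))
# -> ['front_yard', 'porch', 'side_gate']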
async def _execute_search_objects(
arguments: Dict[str, Any],
allowed_cameras: List[str],
config: FrigateConfig,
) -> JSONResponse:
"""
Execute the search_objects tool.
@@ -437,6 +467,11 @@ async def _execute_search_objects(
# Convert zones array to comma-separated string if provided
zones = arguments.get("zones")
if isinstance(zones, list):
camera_arg = arguments.get("camera")
target_cameras = (
[camera_arg] if camera_arg and camera_arg != "all" else allowed_cameras
)
zones = _resolve_zones(zones, config, target_cameras)
zones = ",".join(zones)
elif zones is None:
zones = "all"
@@ -528,6 +563,11 @@ async def _execute_find_similar_objects(
sub_labels = arguments.get("sub_labels")
zones = arguments.get("zones")
if zones:
zones = _resolve_zones(
zones, request.app.frigate_config, cameras or list(allowed_cameras)
)
similarity_mode = arguments.get("similarity_mode", "fused")
if similarity_mode not in ("visual", "semantic", "fused"):
similarity_mode = "fused"
@@ -655,7 +695,9 @@ async def execute_tool(
logger.debug(f"Executing tool: {tool_name} with arguments: {arguments}")
if tool_name == "search_objects":
return await _execute_search_objects(arguments, allowed_cameras)
return await _execute_search_objects(
arguments, allowed_cameras, request.app.frigate_config
)
if tool_name == "find_similar_objects":
result = await _execute_find_similar_objects(
@@ -835,7 +877,9 @@ async def _execute_tool_internal(
This is used by the chat completion endpoint to execute tools.
"""
if tool_name == "search_objects":
response = await _execute_search_objects(arguments, allowed_cameras)
response = await _execute_search_objects(
arguments, allowed_cameras, request.app.frigate_config
)
try:
if hasattr(response, "body"):
body_str = response.body.decode("utf-8")
@@ -899,6 +943,9 @@ async def _execute_start_camera_watch(
await require_camera_access(camera, request=request)
if zones:
zones = _resolve_zones(zones, config, [camera])
genai_manager = request.app.genai_manager
chat_client = genai_manager.chat_client
if chat_client is None or not chat_client.supports_vision:

File 2 of 6

@@ -39,6 +39,8 @@ logger = logging.getLogger(__name__)
RECORDING_BUFFER_EXTENSION_PERCENT = 0.10
MIN_RECORDING_DURATION = 10
MAX_IMAGE_TOKENS = 24000
MAX_FRAMES_PER_SECOND = 2
class ReviewDescriptionProcessor(PostProcessorApi):
@@ -60,14 +62,22 @@ class ReviewDescriptionProcessor(PostProcessorApi):
def calculate_frame_count(
self,
camera: str,
duration: float,
image_source: ImageSourceEnum = ImageSourceEnum.preview,
height: int = 480,
) -> int:
"""Calculate optimal number of frames based on context size, image source, and resolution.
"""Calculate optimal number of frames based on event duration, context size,
image source, and resolution.
Token usage varies by resolution: larger images (ultra-wide aspect ratios) use more tokens.
Estimates ~1 token per 1250 pixels. Targets 98% context utilization with safety margin.
Capped at 20 frames.
The per-image token cost is requested from the GenAI provider so providers
that know their model's true cost (e.g. llama.cpp can probe the loaded
mmproj) can diverge from the default ~1-token-per-1250-pixels heuristic. The frame
budget is bounded by:
- remaining context window after prompt + response reservations
- a fixed MAX_IMAGE_TOKENS ceiling
- MAX_FRAMES_PER_SECOND x duration, to avoid drowning short events in
near-duplicate frames where the model latches onto the redundant middle
and skips the start/end action
"""
client = self.genai_manager.description_client
@@ -105,14 +115,15 @@ class ReviewDescriptionProcessor(PostProcessorApi):
width = target_width
height = int(target_width / aspect_ratio)
pixels_per_image = width * height
tokens_per_image = pixels_per_image / 1250
tokens_per_image = client.estimate_image_tokens(width, height)
prompt_tokens = 3800
response_tokens = 300
available_tokens = context_size - prompt_tokens - response_tokens
max_frames = int(available_tokens / tokens_per_image)
return min(max(max_frames, 3), 20)
context_budget = context_size - prompt_tokens - response_tokens
image_token_budget = min(context_budget, MAX_IMAGE_TOKENS)
max_frames_by_tokens = int(image_token_budget / tokens_per_image)
max_frames_by_duration = int(duration * MAX_FRAMES_PER_SECOND)
max_frames = min(max_frames_by_tokens, max_frames_by_duration)
return max(max_frames, 3)
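A worked example of the three bounds under the default heuristic; the context size, frame dimensions, and event duration below are hypothetical:

# 8192-token context, 640x360 preview frames, 12-second review item
tokens_per_image = (640 * 360) / 1250                   # ~184.3
context_budget = 8192 - 3800 - 300                      # 4092 after prompt/response reservations
image_token_budget = min(context_budget, 24000)         # MAX_IMAGE_TOKENS ceiling -> 4092
by_tokens = int(image_token_budget / tokens_per_image)  # 22
by_duration = int(12 * 2)                               # MAX_FRAMES_PER_SECOND cap -> 24
print(max(min(by_tokens, by_duration), 3))              # -> 22

For a 5-second event the duration cap binds instead (by_duration = 10), which keeps short clips from being padded with near-duplicate frames.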
def process_data(
self, data: dict[str, Any], data_type: PostProcessDataEnum
@@ -376,7 +387,9 @@ class ReviewDescriptionProcessor(PostProcessorApi):
all_frames.append(os.path.join(preview_dir, file))
frame_count = len(all_frames)
desired_frame_count = self.calculate_frame_count(camera)
desired_frame_count = self.calculate_frame_count(
camera, duration=end_time - start_time
)
if frame_count <= desired_frame_count:
return all_frames
@@ -400,7 +413,7 @@ class ReviewDescriptionProcessor(PostProcessorApi):
"""Get frames from recordings at specified timestamps."""
duration = end_time - start_time
desired_frame_count = self.calculate_frame_count(
camera, ImageSourceEnum.recordings, height
camera, duration, ImageSourceEnum.recordings, height
)
# Calculate evenly spaced timestamps throughout the duration
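The spacing computation itself is elided from this hunk; a sketch of one plausible form, where the half-frame offset is an assumption to keep samples away from the clip edges:

# n evenly spaced sample points across [start_time, end_time)
n = desired_frame_count
timestamps = [start_time + (i + 0.5) * duration / n for i in range(n)]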

File 3 of 6

@@ -4,6 +4,10 @@ from pydantic import BaseModel, ConfigDict, Field
class ReviewMetadata(BaseModel):
model_config = ConfigDict(extra="ignore", protected_namespaces=())
observations: list[str] = Field(
default_factory=list,
description="Chronological list of significant observations from the frames, written before the scene narrative is composed.",
)
title: str = Field(
description="A short title characterizing what took place and where, under 10 words."
)

File 4 of 6

@@ -163,6 +163,38 @@ Each line represents a detection state, not necessarily unique individuals. The
if prop is not None:
prop.update(hints)
# observations is a chain-of-thought-by-schema field: forcing the model
# to enumerate concrete facts before writing scene/title surfaces details
# the narrative would otherwise gloss over (e.g. brief vehicle arrivals
# overshadowed by a longer activity). The minItems floor scales with
# event duration so longer clips get more observations.
observations_prop = schema.get("properties", {}).get("observations")
if observations_prop is not None:
duration_seconds = float(review_data.get("duration") or 0)
min_observations = max(3, round(duration_seconds / 5))
max_observations = min_observations + 8
observations_prop["description"] = (
"Enumerate the significant observations across all frames, in "
"chronological order, BEFORE composing the scene narrative. "
"Include the very start of the activity — for example, a "
"vehicle entering the frame or pulling into the driveway — "
"even if it lasts only a few frames and the rest of the clip "
"is dominated by a longer activity. Include each arrival, "
"departure, motion event, object handled, and notable change "
"in position or state. Each item is a single concrete fact "
"written as a complete sentence (e.g., 'A blue sedan turns "
"from the street into the driveway', 'Nick exits the driver "
"side carrying a plant pot'). Do not summarize, interpret, or "
"assign meaning here — that belongs in the scene field."
)
observations_prop["minItems"] = min_observations
observations_prop["maxItems"] = max_observations
observations_prop["items"] = {"type": "string", "minLength": 20}
required = schema.setdefault("required", [])
if "observations" not in required:
required.append("observations")
# OpenAI strict mode requires additionalProperties: false on all objects
schema["additionalProperties"] = False
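For a hypothetical 30-second review item, the injected bounds work out as follows:

duration_seconds = 30.0
min_observations = max(3, round(duration_seconds / 5))  # -> 6
max_observations = min_observations + 8                 # -> 14
# Resulting schema fragment (abridged):
# "observations": {"type": "array", "minItems": 6, "maxItems": 14,
#                  "items": {"type": "string", "minLength": 20}}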
@@ -356,6 +388,14 @@ Guidelines:
"""Get the context window size for this provider in tokens."""
return 4096
def estimate_image_tokens(self, width: int, height: int) -> float:
"""Estimate prompt tokens consumed by a single image of the given dimensions.
Default heuristic: ~1 token per 1250 pixels. Providers that can measure or
know their model's exact image-token cost should override.
"""
return (width * height) / 1250
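Spot values of the heuristic at a few illustrative frame sizes:

for w, h in [(640, 360), (848, 480), (1280, 720)]:
    print(f"{w}x{h}: ~{(w * h) / 1250:.0f} tokens")
# 640x360: ~184, 848x480: ~326, 1280x720: ~737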
def embed(
self,
texts: list[str] | None = None,

View File

@@ -42,6 +42,8 @@ class LlamaCppClient(GenAIClient):
_supports_vision: bool
_supports_audio: bool
_supports_tools: bool
_image_token_cache: dict[tuple[int, int], int]
_text_baseline_tokens: int | None
def _init_provider(self) -> str | None:
"""Initialize the client and query model metadata from the server."""
@@ -52,6 +54,8 @@ class LlamaCppClient(GenAIClient):
self._supports_vision = False
self._supports_audio = False
self._supports_tools = False
self._image_token_cache = {}
self._text_baseline_tokens = None
base_url = (
self.genai_config.base_url.rstrip("/")
@@ -272,6 +276,91 @@
return self._context_size
return 4096
def estimate_image_tokens(self, width: int, height: int) -> float:
"""Probe the llama.cpp server to learn the model's image-token cost at the
requested dimensions.
llama.cpp's image tokenization is a deterministic function of dimensions and
the loaded mmproj, so the result is cached per (width, height) for the
lifetime of the process. Falls back to the base pixel heuristic if the
server is unreachable or the response is malformed.
"""
if self.provider is None:
return super().estimate_image_tokens(width, height)
cached = self._image_token_cache.get((width, height))
if cached is not None:
return cached
try:
baseline = self._probe_baseline_tokens()
with_image = self._probe_image_prompt_tokens(width, height)
tokens = max(1, with_image - baseline)
except Exception as e:
logger.debug(
"llama.cpp image-token probe failed for %dx%d (%s); using heuristic",
width,
height,
e,
)
return super().estimate_image_tokens(width, height)
self._image_token_cache[(width, height)] = tokens
logger.debug(
"llama.cpp model '%s' uses ~%d tokens for %dx%d images",
self.genai_config.model,
tokens,
width,
height,
)
return tokens
def _probe_baseline_tokens(self) -> int:
"""Return prompt_tokens for a minimal text-only request. Cached after first call."""
if self._text_baseline_tokens is not None:
return self._text_baseline_tokens
self._text_baseline_tokens = self._probe_prompt_tokens(
[{"type": "text", "text": "."}]
)
return self._text_baseline_tokens
def _probe_image_prompt_tokens(self, width: int, height: int) -> int:
"""Return prompt_tokens for a single synthetic image plus minimal text."""
img = Image.new("RGB", (width, height), (128, 128, 128))
buf = io.BytesIO()
img.save(buf, format="JPEG", quality=60)
encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
return self._probe_prompt_tokens(
[
{"type": "text", "text": "."},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
},
]
)
def _probe_prompt_tokens(self, content: list[dict[str, Any]]) -> int:
"""POST a 1-token chat completion and return reported prompt_tokens.
Uses a generous timeout to absorb a cold model load on the first probe
when the server lazily loads models on demand (e.g. llama-swap).
"""
payload = {
"model": self.genai_config.model,
"messages": [{"role": "user", "content": content}],
"max_tokens": 1,
}
response = requests.post(
f"{self.provider}/v1/chat/completions",
json=payload,
timeout=60,
)
response.raise_for_status()
return int(response.json()["usage"]["prompt_tokens"])
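A standalone sketch of the same delta probe, runnable against a local llama.cpp server; the base URL and model name are assumptions, and the request/usage fields follow the OpenAI-compatible chat completions schema used above:

import base64
import io

import requests
from PIL import Image

BASE_URL = "http://127.0.0.1:8080"  # hypothetical llama.cpp server

def prompt_tokens(content: list[dict]) -> int:
    # Ask for a 1-token completion; only the reported prompt_tokens matters.
    resp = requests.post(
        f"{BASE_URL}/v1/chat/completions",
        json={
            "model": "default",  # placeholder model name
            "messages": [{"role": "user", "content": content}],
            "max_tokens": 1,
        },
        timeout=60,
    )
    resp.raise_for_status()
    return int(resp.json()["usage"]["prompt_tokens"])

baseline = prompt_tokens([{"type": "text", "text": "."}])

img = Image.new("RGB", (848, 480), (128, 128, 128))  # synthetic gray frame
buf = io.BytesIO()
img.save(buf, format="JPEG", quality=60)
data_url = "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode()

with_image = prompt_tokens(
    [
        {"type": "text", "text": "."},
        {"type": "image_url", "image_url": {"url": data_url}},
    ]
)
print(max(1, with_image - baseline))  # per-image token cost at 848x480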
def _build_payload(
self,
messages: list[dict[str, Any]],

File 6 of 6

@@ -155,14 +155,40 @@ export function MessageBubble({
) : (
<div
className={cn(
"[&>*:last-child]:inline",
!isComplete &&
"after:ml-0.5 after:inline-block after:h-4 after:w-2 after:animate-cursor-blink after:rounded-sm after:bg-foreground after:align-middle after:content-['']",
"[&>p:last-child]:inline after:ml-0.5 after:inline-block after:h-4 after:w-2 after:animate-cursor-blink after:rounded-sm after:bg-foreground after:align-middle after:content-['']",
)}
>
<ReactMarkdown
remarkPlugins={[remarkGfm]}
components={{
p: ({ node: _n, ...props }) => (
<p className="my-2 first:mt-0 last:mb-0" {...props} />
),
ul: ({ node: _n, ...props }) => (
<ul
className="my-2 list-disc space-y-1 pl-6 first:mt-0 last:mb-0"
{...props}
/>
),
ol: ({ node: _n, ...props }) => (
<ol
className="my-2 list-decimal space-y-1 pl-6 first:mt-0 last:mb-0"
{...props}
/>
),
li: ({ node: _n, ...props }) => (
<li className="pl-1" {...props} />
),
code: ({ node: _n, className, ...props }) => (
<code
className={cn(
"rounded bg-foreground/10 px-1 py-0.5 font-mono text-sm",
className,
)}
{...props}
/>
),
table: ({ node: _n, ...props }) => (
<table
className="my-2 w-full border-collapse border border-border"