diff --git a/frigate/api/chat.py b/frigate/api/chat.py
index 0543d5f8a..939512dd5 100644
--- a/frigate/api/chat.py
+++ b/frigate/api/chat.py
@@ -36,6 +36,7 @@ from frigate.api.defs.response.chat_response import (
 )
 from frigate.api.defs.tags import Tags
 from frigate.api.event import events
+from frigate.config import FrigateConfig
 from frigate.genai.utils import build_assistant_message_for_conversation
 from frigate.jobs.vlm_watch import (
     get_vlm_watch_job,
@@ -401,9 +402,38 @@ def get_tools() -> JSONResponse:
     return JSONResponse(content={"tools": tools})
 
 
+def _resolve_zones(
+    zones: List[str],
+    config: FrigateConfig,
+    target_cameras: List[str],
+) -> List[str]:
+    """Map zone names to their canonical config keys, case-insensitively.
+
+    LLMs frequently echo a user's casing ("Front_Yard") instead of the
+    configured key ("front_yard"). The downstream zone filter is a SQLite GLOB
+    over the JSON-encoded zones column, which is case-sensitive, so an
+    unnormalized name silently returns zero matches. Build a lookup over the
+    relevant cameras' configured zones and substitute when we find a match;
+    unknown names pass through so behavior matches what the model asked for.
+    """
+    if not zones:
+        return zones
+
+    lookup: Dict[str, str] = {}
+    for camera_id in target_cameras:
+        camera_config = config.cameras.get(camera_id)
+        if camera_config is None:
+            continue
+        for zone_name in camera_config.zones.keys():
+            lookup.setdefault(zone_name.lower(), zone_name)
+
+    return [lookup.get(z.lower(), z) for z in zones]
+
+
 async def _execute_search_objects(
     arguments: Dict[str, Any],
     allowed_cameras: List[str],
+    config: FrigateConfig,
 ) -> JSONResponse:
     """
     Execute the search_objects tool.
@@ -437,6 +467,11 @@ async def _execute_search_objects(
     # Convert zones array to comma-separated string if provided
     zones = arguments.get("zones")
     if isinstance(zones, list):
+        camera_arg = arguments.get("camera")
+        target_cameras = (
+            [camera_arg] if camera_arg and camera_arg != "all" else allowed_cameras
+        )
+        zones = _resolve_zones(zones, config, target_cameras)
         zones = ",".join(zones)
     elif zones is None:
         zones = "all"
@@ -528,6 +563,11 @@ async def _execute_find_similar_objects(
     sub_labels = arguments.get("sub_labels")
     zones = arguments.get("zones")
 
+    if zones:
+        zones = _resolve_zones(
+            zones, request.app.frigate_config, cameras or list(allowed_cameras)
+        )
+
     similarity_mode = arguments.get("similarity_mode", "fused")
     if similarity_mode not in ("visual", "semantic", "fused"):
         similarity_mode = "fused"
@@ -655,7 +695,9 @@ async def execute_tool(
     logger.debug(f"Executing tool: {tool_name} with arguments: {arguments}")
 
     if tool_name == "search_objects":
-        return await _execute_search_objects(arguments, allowed_cameras)
+        return await _execute_search_objects(
+            arguments, allowed_cameras, request.app.frigate_config
+        )
 
     if tool_name == "find_similar_objects":
         result = await _execute_find_similar_objects(
@@ -835,7 +877,9 @@ async def _execute_tool_internal(
     This is used by the chat completion endpoint to execute tools.
     """
     if tool_name == "search_objects":
-        response = await _execute_search_objects(arguments, allowed_cameras)
+        response = await _execute_search_objects(
+            arguments, allowed_cameras, request.app.frigate_config
+        )
         try:
             if hasattr(response, "body"):
                 body_str = response.body.decode("utf-8")
@@ -899,6 +943,9 @@ async def _execute_start_camera_watch(
 
     await require_camera_access(camera, request=request)
 
+    if zones:
+        zones = _resolve_zones(zones, config, [camera])
+
     genai_manager = request.app.genai_manager
     chat_client = genai_manager.chat_client
     if chat_client is None or not chat_client.supports_vision:
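For reference, a minimal standalone sketch of the lookup behavior `_resolve_zones` implements, with a plain dict standing in for `FrigateConfig.cameras` (the camera and zone names below are hypothetical):

```python
# Stand-in for FrigateConfig.cameras: camera id -> configured zone keys.
def resolve_zones(zones, cameras, target_cameras):
    lookup = {}
    for camera_id in target_cameras:
        for zone_name in cameras.get(camera_id, []):
            # First camera to define a name wins, mirroring setdefault above.
            lookup.setdefault(zone_name.lower(), zone_name)
    return [lookup.get(z.lower(), z) for z in zones]

cameras = {"front": ["front_yard", "porch"], "back": ["back_yard"]}
assert resolve_zones(["Front_Yard", "PORCH"], cameras, ["front"]) == ["front_yard", "porch"]
# Unknown names pass through unchanged rather than being dropped:
assert resolve_zones(["driveway"], cameras, ["front"]) == ["driveway"]
```

Note that only casing is normalized: a name like "Front Yard" (space instead of the configured underscore) would still miss, since the lookup lowercases but does not rewrite separators.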
""" if tool_name == "search_objects": - response = await _execute_search_objects(arguments, allowed_cameras) + response = await _execute_search_objects( + arguments, allowed_cameras, request.app.frigate_config + ) try: if hasattr(response, "body"): body_str = response.body.decode("utf-8") @@ -899,6 +943,9 @@ async def _execute_start_camera_watch( await require_camera_access(camera, request=request) + if zones: + zones = _resolve_zones(zones, config, [camera]) + genai_manager = request.app.genai_manager chat_client = genai_manager.chat_client if chat_client is None or not chat_client.supports_vision: diff --git a/frigate/data_processing/post/review_descriptions.py b/frigate/data_processing/post/review_descriptions.py index 536b57f3c..7f1f65658 100644 --- a/frigate/data_processing/post/review_descriptions.py +++ b/frigate/data_processing/post/review_descriptions.py @@ -39,6 +39,8 @@ logger = logging.getLogger(__name__) RECORDING_BUFFER_EXTENSION_PERCENT = 0.10 MIN_RECORDING_DURATION = 10 +MAX_IMAGE_TOKENS = 24000 +MAX_FRAMES_PER_SECOND = 2 class ReviewDescriptionProcessor(PostProcessorApi): @@ -60,14 +62,22 @@ class ReviewDescriptionProcessor(PostProcessorApi): def calculate_frame_count( self, camera: str, + duration: float, image_source: ImageSourceEnum = ImageSourceEnum.preview, height: int = 480, ) -> int: - """Calculate optimal number of frames based on context size, image source, and resolution. + """Calculate optimal number of frames based on event duration, context size, + image source, and resolution. - Token usage varies by resolution: larger images (ultra-wide aspect ratios) use more tokens. - Estimates ~1 token per 1250 pixels. Targets 98% context utilization with safety margin. - Capped at 20 frames. + Per-image token cost is asked of the GenAI provider so providers that know + their model's true cost (e.g. llama.cpp can probe the loaded mmproj) can + diverge from the default ~1-token-per-1250-pixels heuristic. 
diff --git a/frigate/data_processing/post/types.py b/frigate/data_processing/post/types.py
index 02d27ccce..b4deb1ddd 100644
--- a/frigate/data_processing/post/types.py
+++ b/frigate/data_processing/post/types.py
@@ -4,6 +4,10 @@ from pydantic import BaseModel, ConfigDict, Field
 class ReviewMetadata(BaseModel):
     model_config = ConfigDict(extra="ignore", protected_namespaces=())
 
+    observations: list[str] = Field(
+        default_factory=list,
+        description="Chronological list of significant observations from the frames, written before the scene narrative is composed.",
+    )
     title: str = Field(
         description="A short title characterizing what took place and where, under 10 words."
     )
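Because the field uses `default_factory=list`, previously stored metadata that lacks `observations` still validates. A quick sketch, with the model trimmed to the two fields visible in this diff:

```python
from pydantic import BaseModel, ConfigDict, Field

class ReviewMetadata(BaseModel):  # trimmed: the real model has more fields
    model_config = ConfigDict(extra="ignore", protected_namespaces=())
    observations: list[str] = Field(default_factory=list)
    title: str

# An older payload without "observations" parses, defaulting to []:
meta = ReviewMetadata.model_validate({"title": "Person at the front door"})
assert meta.observations == []
```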
diff --git a/frigate/genai/__init__.py b/frigate/genai/__init__.py
index a38f10de5..203619dea 100644
--- a/frigate/genai/__init__.py
+++ b/frigate/genai/__init__.py
@@ -163,6 +163,38 @@ Each line represents a detection state, not necessarily unique individuals. The
             if prop is not None:
                 prop.update(hints)
 
+        # observations is a chain-of-thought-by-schema field: forcing the model
+        # to enumerate concrete facts before writing scene/title surfaces details
+        # the narrative would otherwise gloss over (e.g. brief vehicle arrivals
+        # overshadowed by a longer activity). The minItems floor scales with
+        # event duration so longer clips get more observations.
+        observations_prop = schema.get("properties", {}).get("observations")
+        if observations_prop is not None:
+            duration_seconds = float(review_data.get("duration") or 0)
+            min_observations = max(3, round(duration_seconds / 5))
+            max_observations = min_observations + 8
+            observations_prop["description"] = (
+                "Enumerate the significant observations across all frames, in "
+                "chronological order, BEFORE composing the scene narrative. "
+                "Include the very start of the activity — for example, a "
+                "vehicle entering the frame or pulling into the driveway — "
+                "even if it lasts only a few frames and the rest of the clip "
+                "is dominated by a longer activity. Include each arrival, "
+                "departure, motion event, object handled, and notable change "
+                "in position or state. Each item is a single concrete fact "
+                "written as a complete sentence (e.g., 'A blue sedan turns "
+                "from the street into the driveway', 'Nick exits the driver "
+                "side carrying a plant pot'). Do not summarize, interpret, or "
+                "assign meaning here — that belongs in the scene field."
+            )
+            observations_prop["minItems"] = min_observations
+            observations_prop["maxItems"] = max_observations
+            observations_prop["items"] = {"type": "string", "minLength": 20}
+
+            required = schema.setdefault("required", [])
+            if "observations" not in required:
+                required.append("observations")
+
         # OpenAI strict mode requires additionalProperties: false on all objects
         schema["additionalProperties"] = False
@@ -356,6 +388,14 @@ Guidelines:
         """Get the context window size for this provider in tokens."""
         return 4096
 
+    def estimate_image_tokens(self, width: int, height: int) -> float:
+        """Estimate prompt tokens consumed by a single image of the given dimensions.
+
+        Default heuristic: ~1 token per 1250 pixels. Providers that can measure or
+        know their model's exact image-token cost should override.
+        """
+        return (width * height) / 1250
+
     def embed(
         self,
         texts: list[str] | None = None,
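For intuition about the minItems/maxItems scaling: roughly one required observation per five seconds of event, with a floor of 3 and a fixed headroom of 8 above the floor. A minimal sketch of the arithmetic as computed in this diff:

```python
def observation_bounds(duration_seconds: float) -> tuple[int, int]:
    min_observations = max(3, round(duration_seconds / 5))
    return min_observations, min_observations + 8

print(observation_bounds(10))   # (3, 11): short clips keep the floor
print(observation_bounds(60))   # (12, 20)
print(observation_bounds(300))  # (60, 68): long clips demand many facts
```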
+ """ + if self.provider is None: + return super().estimate_image_tokens(width, height) + + cached = self._image_token_cache.get((width, height)) + + if cached is not None: + return cached + + try: + baseline = self._probe_baseline_tokens() + with_image = self._probe_image_prompt_tokens(width, height) + tokens = max(1, with_image - baseline) + except Exception as e: + logger.debug( + "llama.cpp image-token probe failed for %dx%d (%s); using heuristic", + width, + height, + e, + ) + return super().estimate_image_tokens(width, height) + + self._image_token_cache[(width, height)] = tokens + logger.debug( + "llama.cpp model '%s' uses ~%d tokens for %dx%d images", + self.genai_config.model, + tokens, + width, + height, + ) + return tokens + + def _probe_baseline_tokens(self) -> int: + """Return prompt_tokens for a minimal text-only request. Cached after first call.""" + if self._text_baseline_tokens is not None: + return self._text_baseline_tokens + + self._text_baseline_tokens = self._probe_prompt_tokens( + [{"type": "text", "text": "."}] + ) + return self._text_baseline_tokens + + def _probe_image_prompt_tokens(self, width: int, height: int) -> int: + """Return prompt_tokens for a single synthetic image plus minimal text.""" + img = Image.new("RGB", (width, height), (128, 128, 128)) + buf = io.BytesIO() + img.save(buf, format="JPEG", quality=60) + encoded = base64.b64encode(buf.getvalue()).decode("utf-8") + return self._probe_prompt_tokens( + [ + {"type": "text", "text": "."}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}, + }, + ] + ) + + def _probe_prompt_tokens(self, content: list[dict[str, Any]]) -> int: + """POST a 1-token chat completion and return reported prompt_tokens. + + Uses a generous timeout to absorb a cold model load on the first probe + when the server lazily loads models on demand (e.g. llama-swap). + """ + payload = { + "model": self.genai_config.model, + "messages": [{"role": "user", "content": content}], + "max_tokens": 1, + } + response = requests.post( + f"{self.provider}/v1/chat/completions", + json=payload, + timeout=60, + ) + response.raise_for_status() + return int(response.json()["usage"]["prompt_tokens"]) + def _build_payload( self, messages: list[dict[str, Any]], diff --git a/web/src/components/chat/ChatMessage.tsx b/web/src/components/chat/ChatMessage.tsx index c5f92b5f4..6478b48fc 100644 --- a/web/src/components/chat/ChatMessage.tsx +++ b/web/src/components/chat/ChatMessage.tsx @@ -155,14 +155,40 @@ export function MessageBubble({ ) : (
diff --git a/web/src/components/chat/ChatMessage.tsx b/web/src/components/chat/ChatMessage.tsx
index c5f92b5f4..6478b48fc 100644
--- a/web/src/components/chat/ChatMessage.tsx
+++ b/web/src/components/chat/ChatMessage.tsx
@@ -155,14 +155,40 @@ export function MessageBubble({
           ) : (
+            ),
+            table: ({ node: _n, ...props }) => (