GenAI Optimizations (#23006)

* Probe image-token usage in llama.cpp so we can decide more appropriately how many frames to include

* Limit frame count based on frames per second

* Handle zone case sensitivity

* Improve formatting

* Add an observations field so the model can build chain-of-thought before outputting the used fields
Nicolas Mowen · 2026-04-25 16:38:18 -06:00 · committed by GitHub
parent 1a1994ca17 · commit 0ea8924727
6 changed files with 235 additions and 16 deletions

File 1 of 6

@@ -36,6 +36,7 @@ from frigate.api.defs.response.chat_response import (
)
from frigate.api.defs.tags import Tags
from frigate.api.event import events
from frigate.config import FrigateConfig
from frigate.genai.utils import build_assistant_message_for_conversation
from frigate.jobs.vlm_watch import (
get_vlm_watch_job,
@@ -401,9 +402,38 @@ def get_tools() -> JSONResponse:
return JSONResponse(content={"tools": tools})
def _resolve_zones(
zones: List[str],
config: FrigateConfig,
target_cameras: List[str],
) -> List[str]:
"""Map zone names to their canonical config keys, case-insensitively.
LLMs frequently echo a user's casing ("Front_Yard") instead of the
configured key ("front_yard"). The downstream zone filter is a SQLite GLOB
over the JSON-encoded zones column, which is case-sensitive, so an
unnormalized name silently returns zero matches. Build a lookup over the
relevant cameras' configured zones and substitute when we find a match;
unknown names pass through so behavior matches what the model asked for.
"""
if not zones:
return zones
lookup: Dict[str, str] = {}
for camera_id in target_cameras:
camera_config = config.cameras.get(camera_id)
if camera_config is None:
continue
for zone_name in camera_config.zones.keys():
lookup.setdefault(zone_name.lower(), zone_name)
return [lookup.get(z.lower(), z) for z in zones]
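A minimal standalone sketch of the lookup behavior, with a stubbed zone list in place of FrigateConfig (the zone names here are hypothetical):

def resolve_zones_sketch(zones: list[str], configured: list[str]) -> list[str]:
    # First occurrence wins on case-insensitive collisions, mirroring setdefault().
    lookup: dict[str, str] = {}
    for name in configured:
        lookup.setdefault(name.lower(), name)
    # Known names normalize to their configured casing; unknown names pass through.
    return [lookup.get(z.lower(), z) for z in zones]

print(resolve_zones_sketch(["Front_Yard", "PORCH", "side_gate"], ["front_yard", "porch"]))
# -> ['front_yard', 'porch', 'side_gate']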
async def _execute_search_objects(
arguments: Dict[str, Any],
allowed_cameras: List[str],
config: FrigateConfig,
) -> JSONResponse:
"""
Execute the search_objects tool.
@@ -437,6 +467,11 @@ async def _execute_search_objects(
# Convert zones array to comma-separated string if provided
zones = arguments.get("zones")
if isinstance(zones, list):
camera_arg = arguments.get("camera")
target_cameras = (
[camera_arg] if camera_arg and camera_arg != "all" else allowed_cameras
)
zones = _resolve_zones(zones, config, target_cameras)
zones = ",".join(zones)
elif zones is None:
zones = "all"
@@ -528,6 +563,11 @@ async def _execute_find_similar_objects(
sub_labels = arguments.get("sub_labels")
zones = arguments.get("zones")
if zones:
zones = _resolve_zones(
zones, request.app.frigate_config, cameras or list(allowed_cameras)
)
similarity_mode = arguments.get("similarity_mode", "fused")
if similarity_mode not in ("visual", "semantic", "fused"):
similarity_mode = "fused"
@@ -655,7 +695,9 @@ async def execute_tool(
logger.debug(f"Executing tool: {tool_name} with arguments: {arguments}")
if tool_name == "search_objects":
return await _execute_search_objects(arguments, allowed_cameras)
return await _execute_search_objects(
arguments, allowed_cameras, request.app.frigate_config
)
if tool_name == "find_similar_objects":
result = await _execute_find_similar_objects(
@@ -835,7 +877,9 @@ async def _execute_tool_internal(
This is used by the chat completion endpoint to execute tools.
"""
if tool_name == "search_objects":
response = await _execute_search_objects(arguments, allowed_cameras)
response = await _execute_search_objects(
arguments, allowed_cameras, request.app.frigate_config
)
try:
if hasattr(response, "body"):
body_str = response.body.decode("utf-8")
@@ -899,6 +943,9 @@ async def _execute_start_camera_watch(
await require_camera_access(camera, request=request)
if zones:
zones = _resolve_zones(zones, config, [camera])
genai_manager = request.app.genai_manager
chat_client = genai_manager.chat_client
if chat_client is None or not chat_client.supports_vision:

File 2 of 6

@@ -39,6 +39,8 @@ logger = logging.getLogger(__name__)
RECORDING_BUFFER_EXTENSION_PERCENT = 0.10
MIN_RECORDING_DURATION = 10
MAX_IMAGE_TOKENS = 24000
MAX_FRAMES_PER_SECOND = 2
class ReviewDescriptionProcessor(PostProcessorApi):
@@ -60,14 +62,22 @@ class ReviewDescriptionProcessor(PostProcessorApi):
def calculate_frame_count(
self,
camera: str,
duration: float,
image_source: ImageSourceEnum = ImageSourceEnum.preview,
height: int = 480,
) -> int:
"""Calculate optimal number of frames based on context size, image source, and resolution.
"""Calculate optimal number of frames based on event duration, context size,
image source, and resolution.
Token usage varies by resolution: larger images (ultra-wide aspect ratios) use more tokens.
Estimates ~1 token per 1250 pixels. Targets 98% context utilization with safety margin.
Capped at 20 frames.
The per-image token cost is requested from the GenAI provider so providers
that know their model's true cost (e.g. llama.cpp can probe the loaded
mmproj) can diverge from the default ~1-token-per-1250-pixels heuristic. The frame
budget is bounded by:
- remaining context window after prompt + response reservations
- a fixed MAX_IMAGE_TOKENS ceiling
- MAX_FRAMES_PER_SECOND x duration, to avoid drowning short events in
near-duplicate frames where the model latches onto the redundant middle
and skips the start/end action
"""
client = self.genai_manager.description_client
@@ -105,14 +115,15 @@ class ReviewDescriptionProcessor(PostProcessorApi):
width = target_width
height = int(target_width / aspect_ratio)
pixels_per_image = width * height
tokens_per_image = pixels_per_image / 1250
tokens_per_image = client.estimate_image_tokens(width, height)
prompt_tokens = 3800
response_tokens = 300
available_tokens = context_size - prompt_tokens - response_tokens
max_frames = int(available_tokens / tokens_per_image)
return min(max(max_frames, 3), 20)
context_budget = context_size - prompt_tokens - response_tokens
image_token_budget = min(context_budget, MAX_IMAGE_TOKENS)
max_frames_by_tokens = int(image_token_budget / tokens_per_image)
max_frames_by_duration = int(duration * MAX_FRAMES_PER_SECOND)
max_frames = min(max_frames_by_tokens, max_frames_by_duration)
return max(max_frames, 3)
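A worked example of the three bounds under the default heuristic; the context size, frame dimensions, and event duration below are hypothetical:

# 8192-token context, 640x360 preview frames, 12-second review item
tokens_per_image = (640 * 360) / 1250                   # ~184.3
context_budget = 8192 - 3800 - 300                      # 4092 after prompt/response reservations
image_token_budget = min(context_budget, 24000)         # MAX_IMAGE_TOKENS ceiling -> 4092
by_tokens = int(image_token_budget / tokens_per_image)  # 22
by_duration = int(12 * 2)                               # MAX_FRAMES_PER_SECOND cap -> 24
print(max(min(by_tokens, by_duration), 3))              # -> 22

For a 5-second event the duration cap binds instead (by_duration = 10), which keeps short clips from being padded with near-duplicate frames.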
def process_data(
self, data: dict[str, Any], data_type: PostProcessDataEnum
@@ -376,7 +387,9 @@ class ReviewDescriptionProcessor(PostProcessorApi):
all_frames.append(os.path.join(preview_dir, file))
frame_count = len(all_frames)
desired_frame_count = self.calculate_frame_count(camera)
desired_frame_count = self.calculate_frame_count(
camera, duration=end_time - start_time
)
if frame_count <= desired_frame_count:
return all_frames
@@ -400,7 +413,7 @@ class ReviewDescriptionProcessor(PostProcessorApi):
"""Get frames from recordings at specified timestamps."""
duration = end_time - start_time
desired_frame_count = self.calculate_frame_count(
camera, ImageSourceEnum.recordings, height
camera, duration, ImageSourceEnum.recordings, height
)
# Calculate evenly spaced timestamps throughout the duration
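The spacing computation itself is elided from this hunk; a sketch of one plausible form, where the half-frame offset is an assumption to keep samples away from the clip edges:

# n evenly spaced sample points across [start_time, end_time)
n = desired_frame_count
timestamps = [start_time + (i + 0.5) * duration / n for i in range(n)]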

File 3 of 6

@@ -4,6 +4,10 @@ from pydantic import BaseModel, ConfigDict, Field
class ReviewMetadata(BaseModel):
model_config = ConfigDict(extra="ignore", protected_namespaces=())
observations: list[str] = Field(
default_factory=list,
description="Chronological list of significant observations from the frames, written before the scene narrative is composed.",
)
title: str = Field(
description="A short title characterizing what took place and where, under 10 words."
)

File 4 of 6

@@ -163,6 +163,38 @@ Each line represents a detection state, not necessarily unique individuals. The
if prop is not None:
prop.update(hints)
# observations is a chain-of-thought-by-schema field: forcing the model
# to enumerate concrete facts before writing scene/title surfaces details
# the narrative would otherwise gloss over (e.g. brief vehicle arrivals
# overshadowed by a longer activity). The minItems floor scales with
# event duration so longer clips get more observations.
observations_prop = schema.get("properties", {}).get("observations")
if observations_prop is not None:
duration_seconds = float(review_data.get("duration") or 0)
min_observations = max(3, round(duration_seconds / 5))
max_observations = min_observations + 8
observations_prop["description"] = (
"Enumerate the significant observations across all frames, in "
"chronological order, BEFORE composing the scene narrative. "
"Include the very start of the activity — for example, a "
"vehicle entering the frame or pulling into the driveway — "
"even if it lasts only a few frames and the rest of the clip "
"is dominated by a longer activity. Include each arrival, "
"departure, motion event, object handled, and notable change "
"in position or state. Each item is a single concrete fact "
"written as a complete sentence (e.g., 'A blue sedan turns "
"from the street into the driveway', 'Nick exits the driver "
"side carrying a plant pot'). Do not summarize, interpret, or "
"assign meaning here — that belongs in the scene field."
)
observations_prop["minItems"] = min_observations
observations_prop["maxItems"] = max_observations
observations_prop["items"] = {"type": "string", "minLength": 20}
required = schema.setdefault("required", [])
if "observations" not in required:
required.append("observations")
# OpenAI strict mode requires additionalProperties: false on all objects
schema["additionalProperties"] = False
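For a hypothetical 30-second review item, the injected bounds work out as follows:

duration_seconds = 30.0
min_observations = max(3, round(duration_seconds / 5))  # -> 6
max_observations = min_observations + 8                 # -> 14
# Resulting schema fragment (abridged):
# "observations": {"type": "array", "minItems": 6, "maxItems": 14,
#                  "items": {"type": "string", "minLength": 20}}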
@@ -356,6 +388,14 @@ Guidelines:
"""Get the context window size for this provider in tokens."""
return 4096
def estimate_image_tokens(self, width: int, height: int) -> float:
"""Estimate prompt tokens consumed by a single image of the given dimensions.
Default heuristic: ~1 token per 1250 pixels. Providers that can measure or
know their model's exact image-token cost should override.
"""
return (width * height) / 1250
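Spot values of the heuristic at a few illustrative frame sizes:

for w, h in [(640, 360), (848, 480), (1280, 720)]:
    print(f"{w}x{h}: ~{(w * h) / 1250:.0f} tokens")
# 640x360: ~184, 848x480: ~326, 1280x720: ~737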
def embed(
self,
texts: list[str] | None = None,

View File

@@ -42,6 +42,8 @@ class LlamaCppClient(GenAIClient):
_supports_vision: bool
_supports_audio: bool
_supports_tools: bool
_image_token_cache: dict[tuple[int, int], int]
_text_baseline_tokens: int | None
def _init_provider(self) -> str | None:
"""Initialize the client and query model metadata from the server."""
@@ -52,6 +54,8 @@ class LlamaCppClient(GenAIClient):
self._supports_vision = False
self._supports_audio = False
self._supports_tools = False
self._image_token_cache = {}
self._text_baseline_tokens = None
base_url = (
self.genai_config.base_url.rstrip("/")
@@ -272,6 +276,91 @@
return self._context_size
return 4096
def estimate_image_tokens(self, width: int, height: int) -> float:
"""Probe the llama.cpp server to learn the model's image-token cost at the
requested dimensions.
llama.cpp's image tokenization is a deterministic function of dimensions and
the loaded mmproj, so the result is cached per (width, height) for the
lifetime of the process. Falls back to the base pixel heuristic if the
server is unreachable or the response is malformed.
"""
if self.provider is None:
return super().estimate_image_tokens(width, height)
cached = self._image_token_cache.get((width, height))
if cached is not None:
return cached
try:
baseline = self._probe_baseline_tokens()
with_image = self._probe_image_prompt_tokens(width, height)
tokens = max(1, with_image - baseline)
except Exception as e:
logger.debug(
"llama.cpp image-token probe failed for %dx%d (%s); using heuristic",
width,
height,
e,
)
return super().estimate_image_tokens(width, height)
self._image_token_cache[(width, height)] = tokens
logger.debug(
"llama.cpp model '%s' uses ~%d tokens for %dx%d images",
self.genai_config.model,
tokens,
width,
height,
)
return tokens
def _probe_baseline_tokens(self) -> int:
"""Return prompt_tokens for a minimal text-only request. Cached after first call."""
if self._text_baseline_tokens is not None:
return self._text_baseline_tokens
self._text_baseline_tokens = self._probe_prompt_tokens(
[{"type": "text", "text": "."}]
)
return self._text_baseline_tokens
def _probe_image_prompt_tokens(self, width: int, height: int) -> int:
"""Return prompt_tokens for a single synthetic image plus minimal text."""
img = Image.new("RGB", (width, height), (128, 128, 128))
buf = io.BytesIO()
img.save(buf, format="JPEG", quality=60)
encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
return self._probe_prompt_tokens(
[
{"type": "text", "text": "."},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
},
]
)
def _probe_prompt_tokens(self, content: list[dict[str, Any]]) -> int:
"""POST a 1-token chat completion and return reported prompt_tokens.
Uses a generous timeout to absorb a cold model load on the first probe
when the server lazily loads models on demand (e.g. llama-swap).
"""
payload = {
"model": self.genai_config.model,
"messages": [{"role": "user", "content": content}],
"max_tokens": 1,
}
response = requests.post(
f"{self.provider}/v1/chat/completions",
json=payload,
timeout=60,
)
response.raise_for_status()
return int(response.json()["usage"]["prompt_tokens"])
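A standalone sketch of the same delta probe, runnable against a local llama.cpp server; the base URL and model name are assumptions, and the request/usage fields follow the OpenAI-compatible chat completions schema used above:

import base64
import io

import requests
from PIL import Image

BASE_URL = "http://127.0.0.1:8080"  # hypothetical llama.cpp server

def prompt_tokens(content: list[dict]) -> int:
    # Ask for a 1-token completion; only the reported prompt_tokens matters.
    resp = requests.post(
        f"{BASE_URL}/v1/chat/completions",
        json={
            "model": "default",  # placeholder model name
            "messages": [{"role": "user", "content": content}],
            "max_tokens": 1,
        },
        timeout=60,
    )
    resp.raise_for_status()
    return int(resp.json()["usage"]["prompt_tokens"])

baseline = prompt_tokens([{"type": "text", "text": "."}])

img = Image.new("RGB", (848, 480), (128, 128, 128))  # synthetic gray frame
buf = io.BytesIO()
img.save(buf, format="JPEG", quality=60)
data_url = "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode()

with_image = prompt_tokens(
    [
        {"type": "text", "text": "."},
        {"type": "image_url", "image_url": {"url": data_url}},
    ]
)
print(max(1, with_image - baseline))  # per-image token cost at 848x480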
def _build_payload(
self,
messages: list[dict[str, Any]],

File 6 of 6

@@ -155,14 +155,40 @@ export function MessageBubble({
) : (
<div
className={cn(
"[&>*:last-child]:inline",
!isComplete &&
"after:ml-0.5 after:inline-block after:h-4 after:w-2 after:animate-cursor-blink after:rounded-sm after:bg-foreground after:align-middle after:content-['']",
"[&>p:last-child]:inline after:ml-0.5 after:inline-block after:h-4 after:w-2 after:animate-cursor-blink after:rounded-sm after:bg-foreground after:align-middle after:content-['']",
)}
>
<ReactMarkdown
remarkPlugins={[remarkGfm]}
components={{
p: ({ node: _n, ...props }) => (
<p className="my-2 first:mt-0 last:mb-0" {...props} />
),
ul: ({ node: _n, ...props }) => (
<ul
className="my-2 list-disc space-y-1 pl-6 first:mt-0 last:mb-0"
{...props}
/>
),
ol: ({ node: _n, ...props }) => (
<ol
className="my-2 list-decimal space-y-1 pl-6 first:mt-0 last:mb-0"
{...props}
/>
),
li: ({ node: _n, ...props }) => (
<li className="pl-1" {...props} />
),
code: ({ node: _n, className, ...props }) => (
<code
className={cn(
"rounded bg-foreground/10 px-1 py-0.5 font-mono text-sm",
className,
)}
{...props}
/>
),
table: ({ node: _n, ...props }) => (
<table
className="my-2 w-full border-collapse border border-border"