Mirror of https://github.com/blakeblackshear/frigate.git (synced 2026-05-01 19:17:41 +03:00)
GenAI Optimizations (#23006)
Some checks are pending
CI / AMD64 Build (push) Waiting to run
CI / ARM Build (push) Waiting to run
CI / Jetson Jetpack 6 (push) Waiting to run
CI / AMD64 Extra Build (push) Blocked by required conditions
CI / ARM Extra Build (push) Blocked by required conditions
CI / Synaptics Build (push) Blocked by required conditions
CI / Assemble and push default build (push) Blocked by required conditions
* Test for image token usage in llama.cpp so we can more appropriately decide how many frames to include
* Limit based on frames per second
* Handle zone case sensitivity
* Improve formatting
* Add observations field so model can build CoT before outputting used fields
This commit is contained in:
parent 1a1994ca17
commit 0ea8924727
@@ -36,6 +36,7 @@ from frigate.api.defs.response.chat_response import (
 )
 from frigate.api.defs.tags import Tags
 from frigate.api.event import events
+from frigate.config import FrigateConfig
 from frigate.genai.utils import build_assistant_message_for_conversation
 from frigate.jobs.vlm_watch import (
     get_vlm_watch_job,
@@ -401,9 +402,38 @@ def get_tools() -> JSONResponse:
     return JSONResponse(content={"tools": tools})
 
 
+def _resolve_zones(
+    zones: List[str],
+    config: FrigateConfig,
+    target_cameras: List[str],
+) -> List[str]:
+    """Map zone names to their canonical config keys, case-insensitively.
+
+    LLMs frequently echo a user's casing ("Front Yard") instead of the
+    configured key ("front_yard"). The downstream zone filter is a SQLite GLOB
+    over the JSON-encoded zones column, which is case-sensitive — so an
+    unnormalized name silently returns zero matches. Build a lookup over the
+    relevant cameras' configured zones and substitute when we find a match;
+    unknown names pass through so behavior matches what the model asked for.
+    """
+    if not zones:
+        return zones
+
+    lookup: Dict[str, str] = {}
+    for camera_id in target_cameras:
+        camera_config = config.cameras.get(camera_id)
+        if camera_config is None:
+            continue
+        for zone_name in camera_config.zones.keys():
+            lookup.setdefault(zone_name.lower(), zone_name)
+
+    return [lookup.get(z.lower(), z) for z in zones]
+
+
 async def _execute_search_objects(
     arguments: Dict[str, Any],
     allowed_cameras: List[str],
+    config: FrigateConfig,
 ) -> JSONResponse:
     """
     Execute the search_objects tool.
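The normalization is easiest to see in isolation. A minimal standalone sketch of the same lookup logic, where a plain dict stands in for the per-camera zone config (not Frigate's actual config API):

    # Hypothetical zone keys for one camera; Frigate zone keys are snake_case.
    configured_zones = {"front_yard": {}, "driveway": {}}

    def resolve(zones):
        lookup = {name.lower(): name for name in configured_zones}
        # Known names are case-normalized; unknown names pass through untouched.
        return [lookup.get(z.lower(), z) for z in zones]

    print(resolve(["Front_Yard", "DRIVEWAY", "porch"]))
    # -> ['front_yard', 'driveway', 'porch']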
@@ -437,6 +467,11 @@ async def _execute_search_objects(
     # Convert zones array to comma-separated string if provided
     zones = arguments.get("zones")
     if isinstance(zones, list):
+        camera_arg = arguments.get("camera")
+        target_cameras = (
+            [camera_arg] if camera_arg and camera_arg != "all" else allowed_cameras
+        )
+        zones = _resolve_zones(zones, config, target_cameras)
         zones = ",".join(zones)
     elif zones is None:
         zones = "all"
@@ -528,6 +563,11 @@ async def _execute_find_similar_objects(
     sub_labels = arguments.get("sub_labels")
     zones = arguments.get("zones")
 
+    if zones:
+        zones = _resolve_zones(
+            zones, request.app.frigate_config, cameras or list(allowed_cameras)
+        )
+
     similarity_mode = arguments.get("similarity_mode", "fused")
     if similarity_mode not in ("visual", "semantic", "fused"):
         similarity_mode = "fused"
@@ -655,7 +695,9 @@ async def execute_tool(
     logger.debug(f"Executing tool: {tool_name} with arguments: {arguments}")
 
     if tool_name == "search_objects":
-        return await _execute_search_objects(arguments, allowed_cameras)
+        return await _execute_search_objects(
+            arguments, allowed_cameras, request.app.frigate_config
+        )
 
     if tool_name == "find_similar_objects":
         result = await _execute_find_similar_objects(
@@ -835,7 +877,9 @@ async def _execute_tool_internal(
     This is used by the chat completion endpoint to execute tools.
     """
     if tool_name == "search_objects":
-        response = await _execute_search_objects(arguments, allowed_cameras)
+        response = await _execute_search_objects(
+            arguments, allowed_cameras, request.app.frigate_config
+        )
         try:
             if hasattr(response, "body"):
                 body_str = response.body.decode("utf-8")
@@ -899,6 +943,9 @@ async def _execute_start_camera_watch(
 
     await require_camera_access(camera, request=request)
 
+    if zones:
+        zones = _resolve_zones(zones, config, [camera])
+
     genai_manager = request.app.genai_manager
     chat_client = genai_manager.chat_client
     if chat_client is None or not chat_client.supports_vision:
@@ -39,6 +39,8 @@ logger = logging.getLogger(__name__)
 
 RECORDING_BUFFER_EXTENSION_PERCENT = 0.10
 MIN_RECORDING_DURATION = 10
+MAX_IMAGE_TOKENS = 24000
+MAX_FRAMES_PER_SECOND = 2
 
 
 class ReviewDescriptionProcessor(PostProcessorApi):
@@ -60,14 +62,22 @@ class ReviewDescriptionProcessor(PostProcessorApi):
     def calculate_frame_count(
         self,
         camera: str,
+        duration: float,
         image_source: ImageSourceEnum = ImageSourceEnum.preview,
         height: int = 480,
     ) -> int:
-        """Calculate optimal number of frames based on context size, image source, and resolution.
+        """Calculate optimal number of frames based on event duration, context size,
+        image source, and resolution.
 
-        Token usage varies by resolution: larger images (ultra-wide aspect ratios) use more tokens.
-        Estimates ~1 token per 1250 pixels. Targets 98% context utilization with safety margin.
-        Capped at 20 frames.
+        Per-image token cost is asked of the GenAI provider so providers that know
+        their model's true cost (e.g. llama.cpp can probe the loaded mmproj) can
+        diverge from the default ~1-token-per-1250-pixels heuristic. The frame
+        budget is bounded by:
+        - remaining context window after prompt + response reservations
+        - a fixed MAX_IMAGE_TOKENS ceiling
+        - MAX_FRAMES_PER_SECOND x duration, to avoid drowning short events in
+          near-duplicate frames where the model latches onto the redundant middle
+          and skips the start/end action
         """
         client = self.genai_manager.description_client
@@ -105,14 +115,15 @@ class ReviewDescriptionProcessor(PostProcessorApi):
             width = target_width
             height = int(target_width / aspect_ratio)
 
-        pixels_per_image = width * height
-        tokens_per_image = pixels_per_image / 1250
+        tokens_per_image = client.estimate_image_tokens(width, height)
         prompt_tokens = 3800
         response_tokens = 300
-        available_tokens = context_size - prompt_tokens - response_tokens
-        max_frames = int(available_tokens / tokens_per_image)
-
-        return min(max(max_frames, 3), 20)
+        context_budget = context_size - prompt_tokens - response_tokens
+        image_token_budget = min(context_budget, MAX_IMAGE_TOKENS)
+        max_frames_by_tokens = int(image_token_budget / tokens_per_image)
+        max_frames_by_duration = int(duration * MAX_FRAMES_PER_SECOND)
+        max_frames = min(max_frames_by_tokens, max_frames_by_duration)
+        return max(max_frames, 3)
 
     def process_data(
         self, data: dict[str, Any], data_type: PostProcessDataEnum
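To make the budget concrete, a worked example with illustrative numbers (the 16384-token context, 640x360 frames, and 15 s event duration are assumptions for the sketch, not Frigate defaults):

    tokens_per_image = (640 * 360) / 1250              # ~184, default heuristic
    context_budget = 16384 - 3800 - 300                # context minus prompt/response reservations
    image_token_budget = min(context_budget, 24000)    # MAX_IMAGE_TOKENS ceiling
    max_frames_by_tokens = int(image_token_budget / tokens_per_image)  # 66
    max_frames_by_duration = int(15.0 * 2)             # duration * MAX_FRAMES_PER_SECOND = 30
    print(max(min(max_frames_by_tokens, max_frames_by_duration), 3))   # -> 30

Here the per-second cap, not the token budget, is the binding constraint: a short event gets 30 frames even though the context could hold 66.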
@@ -376,7 +387,9 @@ class ReviewDescriptionProcessor(PostProcessorApi):
                 all_frames.append(os.path.join(preview_dir, file))
 
         frame_count = len(all_frames)
-        desired_frame_count = self.calculate_frame_count(camera)
+        desired_frame_count = self.calculate_frame_count(
+            camera, duration=end_time - start_time
+        )
 
         if frame_count <= desired_frame_count:
             return all_frames
@@ -400,7 +413,7 @@ class ReviewDescriptionProcessor(PostProcessorApi):
         """Get frames from recordings at specified timestamps."""
         duration = end_time - start_time
         desired_frame_count = self.calculate_frame_count(
-            camera, ImageSourceEnum.recordings, height
+            camera, duration, ImageSourceEnum.recordings, height
         )
 
         # Calculate evenly spaced timestamps throughout the duration
@@ -4,6 +4,10 @@ from pydantic import BaseModel, ConfigDict, Field
 class ReviewMetadata(BaseModel):
     model_config = ConfigDict(extra="ignore", protected_namespaces=())
 
+    observations: list[str] = Field(
+        default_factory=list,
+        description="Chronological list of significant observations from the frames, written before the scene narrative is composed.",
+    )
     title: str = Field(
         description="A short title characterizing what took place and where, under 10 words."
     )
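Worth noting: the field is optional at the Pydantic layer (empty-list default), even though the prompt-side schema marks it required in a later hunk. A toy sketch using only the two fields visible here (the real ReviewMetadata has more fields):

    from pydantic import BaseModel, ConfigDict, Field

    class ToyMetadata(BaseModel):
        model_config = ConfigDict(extra="ignore", protected_namespaces=())
        observations: list[str] = Field(default_factory=list)
        title: str = Field(description="A short title")

    # A response that omits observations still validates; the field is only
    # enforced via the JSON schema sent to the provider.
    print(ToyMetadata.model_validate({"title": "Person walks a dog"}).observations)  # []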
@@ -163,6 +163,38 @@ Each line represents a detection state, not necessarily unique individuals. The
         if prop is not None:
             prop.update(hints)
 
+        # observations is a chain-of-thought-by-schema field: forcing the model
+        # to enumerate concrete facts before writing scene/title surfaces details
+        # the narrative would otherwise gloss past (e.g. brief vehicle arrivals
+        # overshadowed by a longer activity). The minItems floor scales with
+        # event duration so longer clips get more observations.
+        observations_prop = schema.get("properties", {}).get("observations")
+        if observations_prop is not None:
+            duration_seconds = float(review_data.get("duration") or 0)
+            min_observations = max(3, round(duration_seconds / 5))
+            max_observations = min_observations + 8
+            observations_prop["description"] = (
+                "Enumerate the significant observations across all frames, in "
+                "chronological order, BEFORE composing the scene narrative. "
+                "Include the very start of the activity — for example, a "
+                "vehicle entering the frame or pulling into the driveway — "
+                "even if it lasts only a few frames and the rest of the clip "
+                "is dominated by a longer activity. Include each arrival, "
+                "departure, motion event, object handled, and notable change "
+                "in position or state. Each item is a single concrete fact "
+                "written as a complete sentence (e.g., 'A blue sedan turns "
+                "from the street into the driveway', 'Nick exits the driver "
+                "side carrying a plant pot'). Do not summarize, interpret, or "
+                "assign meaning here — that belongs in the scene field."
+            )
+            observations_prop["minItems"] = min_observations
+            observations_prop["maxItems"] = max_observations
+            observations_prop["items"] = {"type": "string", "minLength": 20}
+
+            required = schema.setdefault("required", [])
+            if "observations" not in required:
+                required.append("observations")
+
         # OpenAI strict mode requires additionalProperties: false on all objects
         schema["additionalProperties"] = False
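The duration scaling is simple arithmetic; a quick illustration of the floor and ceiling it produces (durations in seconds are illustrative):

    for duration_seconds in (4, 25, 47, 120):
        min_obs = max(3, round(duration_seconds / 5))
        print(f"{duration_seconds}s -> minItems={min_obs}, maxItems={min_obs + 8}")
    # 4s -> minItems=3, maxItems=11
    # 25s -> minItems=5, maxItems=13
    # 47s -> minItems=9, maxItems=17
    # 120s -> minItems=24, maxItems=32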
@@ -356,6 +388,14 @@ Guidelines:
         """Get the context window size for this provider in tokens."""
         return 4096
 
+    def estimate_image_tokens(self, width: int, height: int) -> float:
+        """Estimate prompt tokens consumed by a single image of the given dimensions.
+
+        Default heuristic: ~1 token per 1250 pixels. Providers that can measure or
+        know their model's exact image-token cost should override.
+        """
+        return (width * height) / 1250
+
     def embed(
         self,
         texts: list[str] | None = None,
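For scale, the default heuristic prices common frame sizes like so:

    def estimate_image_tokens(width: int, height: int) -> float:
        return (width * height) / 1250

    print(estimate_image_tokens(640, 480))   # 245.76
    print(estimate_image_tokens(1280, 720))  # 737.28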
@@ -42,6 +42,8 @@ class LlamaCppClient(GenAIClient):
     _supports_vision: bool
     _supports_audio: bool
     _supports_tools: bool
+    _image_token_cache: dict[tuple[int, int], int]
+    _text_baseline_tokens: int | None
 
     def _init_provider(self) -> str | None:
         """Initialize the client and query model metadata from the server."""
@@ -52,6 +54,8 @@ class LlamaCppClient(GenAIClient):
         self._supports_vision = False
         self._supports_audio = False
         self._supports_tools = False
+        self._image_token_cache = {}
+        self._text_baseline_tokens = None
 
         base_url = (
             self.genai_config.base_url.rstrip("/")
@@ -272,6 +276,91 @@ class LlamaCppClient(GenAIClient):
             return self._context_size
         return 4096
 
+    def estimate_image_tokens(self, width: int, height: int) -> float:
+        """Probe the llama.cpp server to learn the model's image-token cost at the
+        requested dimensions.
+
+        llama.cpp's image tokenization is a deterministic function of dimensions and
+        the loaded mmproj, so the result is cached per (width, height) for the
+        lifetime of the process. Falls back to the base pixel heuristic if the
+        server is unreachable or the response is malformed.
+        """
+        if self.provider is None:
+            return super().estimate_image_tokens(width, height)
+
+        cached = self._image_token_cache.get((width, height))
+
+        if cached is not None:
+            return cached
+
+        try:
+            baseline = self._probe_baseline_tokens()
+            with_image = self._probe_image_prompt_tokens(width, height)
+            tokens = max(1, with_image - baseline)
+        except Exception as e:
+            logger.debug(
+                "llama.cpp image-token probe failed for %dx%d (%s); using heuristic",
+                width,
+                height,
+                e,
+            )
+            return super().estimate_image_tokens(width, height)
+
+        self._image_token_cache[(width, height)] = tokens
+        logger.debug(
+            "llama.cpp model '%s' uses ~%d tokens for %dx%d images",
+            self.genai_config.model,
+            tokens,
+            width,
+            height,
+        )
+        return tokens
+
+    def _probe_baseline_tokens(self) -> int:
+        """Return prompt_tokens for a minimal text-only request. Cached after first call."""
+        if self._text_baseline_tokens is not None:
+            return self._text_baseline_tokens
+
+        self._text_baseline_tokens = self._probe_prompt_tokens(
+            [{"type": "text", "text": "."}]
+        )
+        return self._text_baseline_tokens
+
+    def _probe_image_prompt_tokens(self, width: int, height: int) -> int:
+        """Return prompt_tokens for a single synthetic image plus minimal text."""
+        img = Image.new("RGB", (width, height), (128, 128, 128))
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG", quality=60)
+        encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
+        return self._probe_prompt_tokens(
+            [
+                {"type": "text", "text": "."},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
+                },
+            ]
+        )
+
+    def _probe_prompt_tokens(self, content: list[dict[str, Any]]) -> int:
+        """POST a 1-token chat completion and return reported prompt_tokens.
+
+        Uses a generous timeout to absorb a cold model load on the first probe
+        when the server lazily loads models on demand (e.g. llama-swap).
+        """
+        payload = {
+            "model": self.genai_config.model,
+            "messages": [{"role": "user", "content": content}],
+            "max_tokens": 1,
+        }
+        response = requests.post(
+            f"{self.provider}/v1/chat/completions",
+            json=payload,
+            timeout=60,
+        )
+        response.raise_for_status()
+        return int(response.json()["usage"]["prompt_tokens"])
+
     def _build_payload(
         self,
         messages: list[dict[str, Any]],
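The probe can be exercised outside Frigate against any llama.cpp server with a vision model and mmproj loaded. A standalone sketch of the same two-request differencing; the server URL and frame size are illustrative:

    import base64
    import io

    import requests
    from PIL import Image

    SERVER = "http://localhost:8080"  # illustrative llama.cpp server URL

    def prompt_tokens(content):
        # max_tokens=1 keeps the probe cheap; only usage.prompt_tokens matters.
        r = requests.post(
            f"{SERVER}/v1/chat/completions",
            json={"messages": [{"role": "user", "content": content}], "max_tokens": 1},
            timeout=60,
        )
        r.raise_for_status()
        return int(r.json()["usage"]["prompt_tokens"])

    # Encode a synthetic gray frame at the resolution of interest.
    buf = io.BytesIO()
    Image.new("RGB", (640, 360), (128, 128, 128)).save(buf, format="JPEG", quality=60)
    image_url = "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode()

    baseline = prompt_tokens([{"type": "text", "text": "."}])
    with_image = prompt_tokens(
        [
            {"type": "text", "text": "."},
            {"type": "image_url", "image_url": {"url": image_url}},
        ]
    )
    print(max(1, with_image - baseline))  # the model's per-image cost at 640x360

Because the image content is irrelevant to tokenization, a flat gray JPEG is enough; only the dimensions and the loaded mmproj determine the count.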
@@ -155,14 +155,40 @@ export function MessageBubble({
       ) : (
         <div
           className={cn(
-            "[&>*:last-child]:inline",
             !isComplete &&
-              "after:ml-0.5 after:inline-block after:h-4 after:w-2 after:animate-cursor-blink after:rounded-sm after:bg-foreground after:align-middle after:content-['']",
+              "[&>p:last-child]:inline after:ml-0.5 after:inline-block after:h-4 after:w-2 after:animate-cursor-blink after:rounded-sm after:bg-foreground after:align-middle after:content-['']",
           )}
         >
           <ReactMarkdown
             remarkPlugins={[remarkGfm]}
             components={{
+              p: ({ node: _n, ...props }) => (
+                <p className="my-2 first:mt-0 last:mb-0" {...props} />
+              ),
+              ul: ({ node: _n, ...props }) => (
+                <ul
+                  className="my-2 list-disc space-y-1 pl-6 first:mt-0 last:mb-0"
+                  {...props}
+                />
+              ),
+              ol: ({ node: _n, ...props }) => (
+                <ol
+                  className="my-2 list-decimal space-y-1 pl-6 first:mt-0 last:mb-0"
+                  {...props}
+                />
+              ),
+              li: ({ node: _n, ...props }) => (
+                <li className="pl-1" {...props} />
+              ),
+              code: ({ node: _n, className, ...props }) => (
+                <code
+                  className={cn(
+                    "rounded bg-foreground/10 px-1 py-0.5 font-mono text-sm",
+                    className,
+                  )}
+                  {...props}
+                />
+              ),
               table: ({ node: _n, ...props }) => (
                 <table
                   className="my-2 w-full border-collapse border border-border"