Compare commits

...

8 Commits

Author SHA1 Message Date
mathieu-d
d18b7f8f97
Merge 0f3dd097ec into 0ea8924727 2026-04-26 04:46:56 -06:00
Nicolas Mowen
0ea8924727
GenAI Optimizations (#23006)
* Test for image token usage in llama.cpp so we can more appropriately decide how many frames to include

* Limit based on frames per second

* handle zone case sensitivity

* Improve formatting

* Add observations field so model can build CoT before outputting used fields
2026-04-25 17:38:18 -05:00
Josh Hawkins
1a1994ca17
Miscellaneous fixes (#23000)
* ensure classification wizard dialog is scrollable on mobile too

* add chat and features group to mobile menu

Co-authored-by: Copilot <copilot@github.com>

* Set min length for summary too

* Don't use orange for review item

---------

Co-authored-by: Copilot <copilot@github.com>
Co-authored-by: Nicolas Mowen <nickmowen213@gmail.com>
2026-04-25 09:12:20 -06:00
mathieu-d
0f3dd097ec Prepare for pull request. Remove specific configurations 2026-04-17 22:25:46 +02:00
mathieu-d
2a4d7e4766 Prepare for pull request. Remove specific configurations 2026-04-14 23:14:31 +02:00
mathieu-d
46415ffeb5 Add Hailo-10H detector configuration to global.json 2026-04-14 22:54:58 +02:00
mathieu-d
e35ab0b8a1 Add support for temperature reading for Hailo-10H 2026-04-14 22:54:58 +02:00
mathieu-d
837373547d H10 support patch 2026-04-14 22:54:58 +02:00
18 changed files with 745 additions and 52 deletions

View File

@@ -21,6 +21,13 @@ local: version
--tag frigate:latest \
--load
localh10: version
docker buildx build --target=frigate --file docker/main/Dockerfile . \
--build-arg HAILORT_VERSION=5.1.1 \
--build-arg HAILORT_GIT_REPO=mathieu-d/hailort \
--tag frigate:latest \
--load
debug: version
docker buildx build --target=frigate --file docker/main/Dockerfile . \
--build-arg DEBUG=true \

View File

@@ -12,6 +12,11 @@ services:
build:
context: .
dockerfile: docker/main/Dockerfile
# Use args to specify hailort version and location
# args:
# HAILORT_VERSION: "5.1.1"
# HAILORT_GIT_REPO: "mathieu-d/hailort"
# Use target devcontainer-trt for TensorRT dev
target: devcontainer
cache_from:
@@ -29,6 +34,7 @@ services:
# devices:
# - /dev/bus/usb:/dev/bus/usb # Uncomment for Google Coral USB
# - /dev/dri:/dev/dri # for intel hwaccel, needs to be updated for your hardware
volumes:
- .:/workspace/frigate:cached
- ./web/dist:/opt/frigate/web:cached

View File

@@ -0,0 +1,7 @@
#!/bin/bash
# Update package list and install hailo driver version 5.1.1 for Hailo-10H
sudo apt update
sudo apt install -y hailo-h10-all=5.1.1

View File

@@ -157,6 +157,8 @@ FROM base AS wheels
ARG DEBIAN_FRONTEND
ARG TARGETARCH
ARG DEBUG=false
ARG HAILORT_VERSION=4.21.0
ARG HAILORT_GIT_REPO=frigate-nvr/hailort
# Use a separate container to build wheels to prevent build dependencies in final image
RUN apt-get -qq update \

View File

@@ -2,13 +2,11 @@
set -euxo pipefail
hailo_version="4.21.0"
if [[ "${TARGETARCH}" == "amd64" ]]; then
arch="x86_64"
elif [[ "${TARGETARCH}" == "arm64" ]]; then
arch="aarch64"
fi
wget -qO- "https://github.com/frigate-nvr/hailort/releases/download/v${hailo_version}/hailort-debian12-${TARGETARCH}.tar.gz" | tar -C / -xzf -
wget -P /wheels/ "https://github.com/frigate-nvr/hailort/releases/download/v${hailo_version}/hailort-${hailo_version}-cp311-cp311-linux_${arch}.whl"
wget -qO- "https://github.com/${HAILORT_GIT_REPO}/releases/download/v${HAILORT_VERSION}/hailort-debian12-${TARGETARCH}.tar.gz" | tar -C / -xzf -
wget -P /wheels/ "https://github.com/${HAILORT_GIT_REPO}/releases/download/v${HAILORT_VERSION}/hailort-${HAILORT_VERSION}-cp311-cp311-linux_${arch}.whl"

View File

@@ -36,6 +36,7 @@ from frigate.api.defs.response.chat_response import (
)
from frigate.api.defs.tags import Tags
from frigate.api.event import events
from frigate.config import FrigateConfig
from frigate.genai.utils import build_assistant_message_for_conversation
from frigate.jobs.vlm_watch import (
get_vlm_watch_job,
@@ -401,9 +402,38 @@ def get_tools() -> JSONResponse:
return JSONResponse(content={"tools": tools})
def _resolve_zones(
zones: List[str],
config: FrigateConfig,
target_cameras: List[str],
) -> List[str]:
"""Map zone names to their canonical config keys, case-insensitively.
LLMs frequently echo a user's casing ("Front_Yard") instead of the
configured key ("front_yard"). The downstream zone filter is a SQLite GLOB
over the JSON-encoded zones column, which is case-sensitive, so an
unnormalized name silently returns zero matches. Build a lookup over the
relevant cameras' configured zones and substitute when we find a match;
unknown names pass through so behavior matches what the model asked for.
"""
if not zones:
return zones
lookup: Dict[str, str] = {}
for camera_id in target_cameras:
camera_config = config.cameras.get(camera_id)
if camera_config is None:
continue
for zone_name in camera_config.zones.keys():
lookup.setdefault(zone_name.lower(), zone_name)
return [lookup.get(z.lower(), z) for z in zones]
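A minimal usage sketch, assuming a hypothetical camera "front" whose config defines the zones "front_yard" and "porch":
# Only casing is normalized; unknown names pass through untouched.
_resolve_zones(["Front_Yard", "PORCH", "garage"], config, ["front"])
# -> ["front_yard", "porch", "garage"]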
async def _execute_search_objects(
arguments: Dict[str, Any],
allowed_cameras: List[str],
config: FrigateConfig,
) -> JSONResponse:
"""
Execute the search_objects tool.
@@ -437,6 +467,11 @@ async def _execute_search_objects(
# Convert zones array to comma-separated string if provided
zones = arguments.get("zones")
if isinstance(zones, list):
camera_arg = arguments.get("camera")
target_cameras = (
[camera_arg] if camera_arg and camera_arg != "all" else allowed_cameras
)
zones = _resolve_zones(zones, config, target_cameras)
zones = ",".join(zones)
elif zones is None:
zones = "all"
@@ -528,6 +563,11 @@ async def _execute_find_similar_objects(
sub_labels = arguments.get("sub_labels")
zones = arguments.get("zones")
if zones:
zones = _resolve_zones(
zones, request.app.frigate_config, cameras or list(allowed_cameras)
)
similarity_mode = arguments.get("similarity_mode", "fused")
if similarity_mode not in ("visual", "semantic", "fused"):
similarity_mode = "fused"
@@ -655,7 +695,9 @@ async def execute_tool(
logger.debug(f"Executing tool: {tool_name} with arguments: {arguments}")
if tool_name == "search_objects":
return await _execute_search_objects(arguments, allowed_cameras)
return await _execute_search_objects(
arguments, allowed_cameras, request.app.frigate_config
)
if tool_name == "find_similar_objects":
result = await _execute_find_similar_objects(
@@ -835,7 +877,9 @@ async def _execute_tool_internal(
This is used by the chat completion endpoint to execute tools.
"""
if tool_name == "search_objects":
response = await _execute_search_objects(arguments, allowed_cameras)
response = await _execute_search_objects(
arguments, allowed_cameras, request.app.frigate_config
)
try:
if hasattr(response, "body"):
body_str = response.body.decode("utf-8")
@@ -899,6 +943,9 @@ async def _execute_start_camera_watch(
await require_camera_access(camera, request=request)
if zones:
zones = _resolve_zones(zones, config, [camera])
genai_manager = request.app.genai_manager
chat_client = genai_manager.chat_client
if chat_client is None or not chat_client.supports_vision:

View File

@@ -39,6 +39,8 @@ logger = logging.getLogger(__name__)
RECORDING_BUFFER_EXTENSION_PERCENT = 0.10
MIN_RECORDING_DURATION = 10
MAX_IMAGE_TOKENS = 24000
MAX_FRAMES_PER_SECOND = 2
class ReviewDescriptionProcessor(PostProcessorApi):
@@ -60,14 +62,22 @@ class ReviewDescriptionProcessor(PostProcessorApi):
def calculate_frame_count(
self,
camera: str,
duration: float,
image_source: ImageSourceEnum = ImageSourceEnum.preview,
height: int = 480,
) -> int:
"""Calculate optimal number of frames based on context size, image source, and resolution.
"""Calculate optimal number of frames based on event duration, context size,
image source, and resolution.
Token usage varies by resolution: larger images (ultra-wide aspect ratios) use more tokens.
Estimates ~1 token per 1250 pixels. Targets 98% context utilization with safety margin.
Capped at 20 frames.
The per-image token cost is queried from the GenAI provider, so providers
that know their model's true cost (e.g. llama.cpp can probe the loaded
mmproj) can diverge from the default ~1-token-per-1250-pixels heuristic. The
frame
budget is bounded by:
- remaining context window after prompt + response reservations
- a fixed MAX_IMAGE_TOKENS ceiling
- MAX_FRAMES_PER_SECOND x duration, to avoid drowning short events in
near-duplicate frames where the model latches onto the redundant middle
and skips the start/end action
"""
client = self.genai_manager.description_client
@@ -105,14 +115,15 @@ class ReviewDescriptionProcessor(PostProcessorApi):
width = target_width
height = int(target_width / aspect_ratio)
pixels_per_image = width * height
tokens_per_image = pixels_per_image / 1250
tokens_per_image = client.estimate_image_tokens(width, height)
prompt_tokens = 3800
response_tokens = 300
available_tokens = context_size - prompt_tokens - response_tokens
max_frames = int(available_tokens / tokens_per_image)
return min(max(max_frames, 3), 20)
context_budget = context_size - prompt_tokens - response_tokens
image_token_budget = min(context_budget, MAX_IMAGE_TOKENS)
max_frames_by_tokens = int(image_token_budget / tokens_per_image)
max_frames_by_duration = int(duration * MAX_FRAMES_PER_SECOND)
max_frames = min(max_frames_by_tokens, max_frames_by_duration)
return max(max_frames, 3)
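Illustrative budget math, assuming a hypothetical 8192-token context, the default heuristic image cost, an 848x480 preview, and a 4-second event:
# context_budget = 8192 - 3800 - 300 = 4092
# image_token_budget = min(4092, 24000) = 4092
# tokens_per_image = (848 * 480) / 1250 = 325.6  (heuristic fallback)
# max_frames_by_tokens = int(4092 / 325.6) = 12
# max_frames_by_duration = int(4 * 2) = 8  # MAX_FRAMES_PER_SECOND = 2
# min(12, 8) = 8 frames, with the max(_, 3) floor guarding very short events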
def process_data(
self, data: dict[str, Any], data_type: PostProcessDataEnum
@@ -376,7 +387,9 @@ class ReviewDescriptionProcessor(PostProcessorApi):
all_frames.append(os.path.join(preview_dir, file))
frame_count = len(all_frames)
desired_frame_count = self.calculate_frame_count(camera)
desired_frame_count = self.calculate_frame_count(
camera, duration=end_time - start_time
)
if frame_count <= desired_frame_count:
return all_frames
@@ -400,7 +413,7 @@ class ReviewDescriptionProcessor(PostProcessorApi):
"""Get frames from recordings at specified timestamps."""
duration = end_time - start_time
desired_frame_count = self.calculate_frame_count(
camera, ImageSourceEnum.recordings, height
camera, duration, ImageSourceEnum.recordings, height
)
# Calculate evenly spaced timestamps throughout the duration

View File

@@ -4,12 +4,14 @@ from pydantic import BaseModel, ConfigDict, Field
class ReviewMetadata(BaseModel):
model_config = ConfigDict(extra="ignore", protected_namespaces=())
observations: list[str] = Field(
default_factory=list,
description="Chronological list of significant observations from the frames, written before the scene narrative is composed.",
)
title: str = Field(
description="A short title characterizing what took place and where, under 10 words."
)
scene: str = Field(
min_length=120,
max_length=600,
description="A chronological narrative of what happens from start to finish.",
)
shortSummary: str = Field(

View File

@@ -0,0 +1,415 @@
import logging
import os
import subprocess
import threading
import urllib.request
from functools import partial
from typing import Dict, List, Optional, Tuple
import cv2
import numpy as np
from pydantic import ConfigDict, Field
from typing_extensions import Literal
from frigate.const import MODEL_CACHE_DIR
from frigate.detectors.detection_api import DetectionApi
from frigate.detectors.detector_config import (
BaseDetectorConfig,
)
from frigate.object_detection.util import RequestStore, ResponseStore
logger = logging.getLogger(__name__)
# ----------------- Utility Functions ----------------- #
def preprocess_tensor(image: np.ndarray, model_w: int, model_h: int) -> np.ndarray:
"""
Resize an image with unchanged aspect ratio using padding.
Accepts (H, W, 3) input; a leading singleton batch dimension is stripped.
"""
if image.ndim == 4 and image.shape[0] == 1:
image = image[0]
h, w = image.shape[:2]
scale = min(model_w / w, model_h / h)
new_w, new_h = int(w * scale), int(h * scale)
resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
padded_image = np.full((model_h, model_w, 3), 114, dtype=image.dtype)
x_offset = (model_w - new_w) // 2
y_offset = (model_h - new_h) // 2
padded_image[y_offset : y_offset + new_h, x_offset : x_offset + new_w] = (
resized_image
)
return padded_image
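Worked letterbox numbers, assuming a hypothetical 1280x720 frame and a 640x640 model input:
# scale = min(640 / 1280, 640 / 720) = 0.5 -> resized to 640x360
# x_offset = (640 - 640) // 2 = 0; y_offset = (640 - 360) // 2 = 140
# The frame ends up vertically centered on a 114-gray canvas.
preprocess_tensor(frame, 640, 640)  # frame: np.ndarray of shape (720, 1280, 3)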
# ----------------- Global Constants ----------------- #
DETECTOR_KEY = "hailo10h"
ARCH = None
H10H_DEFAULT_MODEL = "yolov6n.hef"
H10H_DEFAULT_URL = "https://hailo-model-zoo.s3.eu-west-2.amazonaws.com/ModelZoo/Compiled/v5.2.0/hailo10h/yolov6n.hef"
def detect_hailo_arch():
try:
result = subprocess.run(
["hailortcli", "fw-control", "identify"], capture_output=True, text=True
)
if result.returncode != 0:
logger.error(f"Inference error: {result.stderr}")
return None
for line in result.stdout.split("\n"):
if "Device Architecture" in line:
if "HAILO10H" in line:
return "hailo10h"
logger.error("Inference error: Could not determine Hailo architecture.")
return None
except Exception as e:
logger.error(f"Inference error: {e}")
return None
# ----------------- HailoAsyncInference Class ----------------- #
class HailoAsyncInference:
def __init__(
self,
hef_path: str,
input_store: RequestStore,
output_store: ResponseStore,
batch_size: int = 1,
input_type: Optional[str] = None,
output_type: Optional[Dict[str, str]] = None,
send_original_frame: bool = False,
) -> None:
# when importing hailo it activates the driver
# which leaves processes running even though it may not be used.
try:
from hailo_platform import (
HEF,
FormatType,
HailoSchedulingAlgorithm,
VDevice,
)
except ModuleNotFoundError:
# hailo_platform is required below (HEF, VDevice, ...); re-raise so a
# missing SDK surfaces as a clear import error rather than a NameError.
raise
self.input_store = input_store
self.output_store = output_store
params = VDevice.create_params()
params.scheduling_algorithm = HailoSchedulingAlgorithm.ROUND_ROBIN
self.hef = HEF(hef_path)
self.target = VDevice(params)
self.infer_model = self.target.create_infer_model(hef_path)
self.infer_model.set_batch_size(batch_size)
if input_type is not None:
self.infer_model.input().set_format_type(getattr(FormatType, input_type))
if output_type is not None:
# Use a distinct loop variable: shadowing output_type here would leave
# self.output_type pointing at the last format string instead of the dict.
for output_name, format_name in output_type.items():
self.infer_model.output(output_name).set_format_type(
getattr(FormatType, format_name)
)
self.output_type = output_type
self.send_original_frame = send_original_frame
def callback(
self,
completion_info,
bindings_list: List,
input_batch: List,
request_ids: List[int],
):
if completion_info.exception:
logger.error(f"Inference error: {completion_info.exception}")
else:
for i, bindings in enumerate(bindings_list):
if len(bindings._output_names) == 1:
result = bindings.output().get_buffer()
else:
result = {
name: np.expand_dims(bindings.output(name).get_buffer(), axis=0)
for name in bindings._output_names
}
self.output_store.put(request_ids[i], (input_batch[i], result))
def _create_bindings(self, configured_infer_model) -> object:
if self.output_type is None:
output_buffers = {
output_info.name: np.empty(
self.infer_model.output(output_info.name).shape,
dtype=getattr(
np, str(output_info.format.type).split(".")[1].lower()
),
)
for output_info in self.hef.get_output_vstream_infos()
}
else:
output_buffers = {
name: np.empty(
self.infer_model.output(name).shape,
dtype=getattr(np, self.output_type[name].lower()),
)
for name in self.output_type
}
return configured_infer_model.create_bindings(output_buffers=output_buffers)
def get_input_shape(self) -> Tuple[int, ...]:
return self.hef.get_input_vstream_infos()[0].shape
def run(self) -> None:
job = None
with self.infer_model.configure() as configured_infer_model:
while True:
batch_data = self.input_store.get()
if batch_data is None:
break
request_id, frame_data = batch_data
preprocessed_batch = [frame_data]
request_ids = [request_id]
input_batch = preprocessed_batch # non-send_original_frame mode
bindings_list = []
for frame in preprocessed_batch:
bindings = self._create_bindings(configured_infer_model)
bindings.input().set_buffer(np.array(frame))
bindings_list.append(bindings)
configured_infer_model.wait_for_async_ready(timeout_ms=10000)
job = configured_infer_model.run_async(
bindings_list,
partial(
self.callback,
input_batch=input_batch,
request_ids=request_ids,
bindings_list=bindings_list,
),
)
if job is not None:
job.wait(100)
# ----------------- HailoDetector Class ----------------- #
class HailoDetector(DetectionApi):
type_key = DETECTOR_KEY
def __init__(self, detector_config: "HailoDetectorConfig"):
global ARCH
ARCH = detect_hailo_arch()
self.cache_dir = MODEL_CACHE_DIR
self.device_type = detector_config.device
self.model_height = (
detector_config.model.height
if hasattr(detector_config.model, "height")
else None
)
self.model_width = (
detector_config.model.width
if hasattr(detector_config.model, "width")
else None
)
self.model_type = (
detector_config.model.model_type
if hasattr(detector_config.model, "model_type")
else None
)
self.tensor_format = (
detector_config.model.input_tensor
if hasattr(detector_config.model, "input_tensor")
else None
)
self.pixel_format = (
detector_config.model.input_pixel_format
if hasattr(detector_config.model, "input_pixel_format")
else None
)
self.input_dtype = (
detector_config.model.input_dtype
if hasattr(detector_config.model, "input_dtype")
else None
)
self.output_type = "FLOAT32"
self.set_path_and_url(detector_config.model.path)
self.working_model_path = self.check_and_prepare()
self.batch_size = 1
self.input_store = RequestStore()
self.response_store = ResponseStore()
try:
logger.debug(f"[INIT] Loading HEF model from {self.working_model_path}")
self.inference_engine = HailoAsyncInference(
self.working_model_path,
self.input_store,
self.response_store,
self.batch_size,
)
self.input_shape = self.inference_engine.get_input_shape()
logger.debug(f"[INIT] Model input shape: {self.input_shape}")
self.inference_thread = threading.Thread(
target=self.inference_engine.run, daemon=True
)
self.inference_thread.start()
except Exception as e:
logger.error(f"[INIT] Failed to initialize HailoAsyncInference: {e}")
raise
def set_path_and_url(self, path: str = None):
if not path:
self.model_path = None
self.url = None
return
if self.is_url(path):
self.url = path
self.model_path = None
else:
self.model_path = path
self.url = None
def is_url(self, url: str) -> bool:
return (
url.startswith("http://")
or url.startswith("https://")
or url.startswith("www.")
)
@staticmethod
def extract_model_name(path: str = None, url: str = None) -> str:
if path and path.endswith(".hef"):
return os.path.basename(path)
elif url and url.endswith(".hef"):
return os.path.basename(url)
else:
return H10H_DEFAULT_MODEL
@staticmethod
def download_model(url: str, destination: str):
if not url.endswith(".hef"):
raise ValueError("Invalid model URL. Only .hef files are supported.")
try:
urllib.request.urlretrieve(url, destination)
logger.debug(f"Downloaded model to {destination}")
except Exception as e:
raise RuntimeError(f"Failed to download model from {url}: {str(e)}")
def check_and_prepare(self) -> str:
if not os.path.exists(self.cache_dir):
os.makedirs(self.cache_dir)
model_name = self.extract_model_name(self.model_path, self.url)
cached_model_path = os.path.join(self.cache_dir, model_name)
if not self.model_path and not self.url:
if os.path.exists(cached_model_path):
logger.debug(f"Model found in cache: {cached_model_path}")
return cached_model_path
else:
logger.debug(f"Downloading default model: {model_name}")
self.download_model(H10H_DEFAULT_URL, cached_model_path)
elif self.url:
logger.debug(f"Downloading model from URL: {self.url}")
self.download_model(self.url, cached_model_path)
elif self.model_path:
if os.path.exists(self.model_path):
logger.debug(f"Using existing model at: {self.model_path}")
return self.model_path
else:
raise FileNotFoundError(f"Model file not found at: {self.model_path}")
return cached_model_path
def detect_raw(self, tensor_input):
tensor_input = self.preprocess(tensor_input)
if isinstance(tensor_input, np.ndarray) and len(tensor_input.shape) == 3:
tensor_input = np.expand_dims(tensor_input, axis=0)
request_id = self.input_store.put(tensor_input)
try:
_, infer_results = self.response_store.get(request_id, timeout=1.0)
except TimeoutError:
logger.error(
f"Timeout waiting for inference results for request {request_id}"
)
if not self.inference_thread.is_alive():
raise RuntimeError(
"HailoRT inference thread has stopped, restart required."
)
return np.zeros((20, 6), dtype=np.float32)
if isinstance(infer_results, list) and len(infer_results) == 1:
infer_results = infer_results[0]
threshold = 0.4
all_detections = []
for class_id, detection_set in enumerate(infer_results):
if not isinstance(detection_set, np.ndarray) or detection_set.size == 0:
continue
for det in detection_set:
if det.shape[0] < 5:
continue
score = float(det[4])
if score < threshold:
continue
all_detections.append([class_id, score, det[0], det[1], det[2], det[3]])
if len(all_detections) == 0:
detections_array = np.zeros((20, 6), dtype=np.float32)
else:
detections_array = np.array(all_detections, dtype=np.float32)
if detections_array.shape[0] > 20:
detections_array = detections_array[:20, :]
elif detections_array.shape[0] < 20:
pad = np.zeros((20 - detections_array.shape[0], 6), dtype=np.float32)
detections_array = np.vstack((detections_array, pad))
return detections_array
def preprocess(self, image):
if isinstance(image, np.ndarray):
processed = preprocess_tensor(
image, self.input_shape[1], self.input_shape[0]
)
return np.expand_dims(processed, axis=0)
else:
raise ValueError("Unsupported image format for preprocessing")
def close(self):
"""Properly shuts down the inference engine and releases the VDevice."""
logger.debug("[CLOSE] Closing HailoDetector")
try:
if hasattr(self, "inference_engine"):
if hasattr(self.inference_engine, "target"):
self.inference_engine.target.release()
logger.debug("Hailo VDevice released successfully")
except Exception as e:
logger.error(f"Failed to close Hailo device: {e}")
raise
def __del__(self):
"""Destructor to ensure cleanup when the object is deleted."""
self.close()
# ----------------- HailoDetectorConfig Class ----------------- #
class HailoDetectorConfig(BaseDetectorConfig):
"""Hailo10H detector using HEF models and the HailoRT SDK for inference on Hailo hardware."""
model_config = ConfigDict(
title="Hailo-10H",
)
type: Literal[DETECTOR_KEY]
device: str = Field(
default="PCIe",
title="Device Type",
description="The device to use for Hailo inference (e.g. 'PCIe', 'M.2').",
)

View File

@@ -151,6 +151,50 @@ Each line represents a detection state, not necessarily unique individuals. The
if "other_concerns" in schema.get("required", []):
schema["required"].remove("other_concerns")
# Length hints injected into the schema as suggestions to the model
# (enforced by grammar-based providers like llama.cpp) but kept off the
# Pydantic model so a non-compliant response does not fail validation.
length_hints = {
"scene": {"minLength": 120, "maxLength": 600},
"shortSummary": {"minLength": 70, "maxLength": 100},
}
for field, hints in length_hints.items():
prop = schema.get("properties", {}).get(field)
if prop is not None:
prop.update(hints)
# observations is a chain-of-thought-by-schema field: forcing the model
# to enumerate concrete facts before writing scene/title surfaces details
# the narrative would otherwise gloss past (e.g. brief vehicle arrivals
# overshadowed by a longer activity). The minItems floor scales with
# event duration so longer clips get more observations.
observations_prop = schema.get("properties", {}).get("observations")
if observations_prop is not None:
duration_seconds = float(review_data.get("duration") or 0)
min_observations = max(3, round(duration_seconds / 5))
max_observations = min_observations + 8
observations_prop["description"] = (
"Enumerate the significant observations across all frames, in "
"chronological order, BEFORE composing the scene narrative. "
"Include the very start of the activity — for example, a "
"vehicle entering the frame or pulling into the driveway — "
"even if it lasts only a few frames and the rest of the clip "
"is dominated by a longer activity. Include each arrival, "
"departure, motion event, object handled, and notable change "
"in position or state. Each item is a single concrete fact "
"written as a complete sentence (e.g., 'A blue sedan turns "
"from the street into the driveway', 'Nick exits the driver "
"side carrying a plant pot'). Do not summarize, interpret, or "
"assign meaning here — that belongs in the scene field."
)
observations_prop["minItems"] = min_observations
observations_prop["maxItems"] = max_observations
observations_prop["items"] = {"type": "string", "minLength": 20}
required = schema.setdefault("required", [])
if "observations" not in required:
required.append("observations")
# OpenAI strict mode requires additionalProperties: false on all objects
schema["additionalProperties"] = False
@@ -344,6 +388,14 @@ Guidelines:
"""Get the context window size for this provider in tokens."""
return 4096
def estimate_image_tokens(self, width: int, height: int) -> float:
"""Estimate prompt tokens consumed by a single image of the given dimensions.
Default heuristic: ~1 token per 1250 pixels. Providers that can measure or
know their model's exact image-token cost should override.
"""
return (width * height) / 1250
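For example, estimate_image_tokens(640, 480) yields 307200 / 1250 = 245.76 tokens under this heuristic, so at the 4096-token default context only a handful of frames fit once the prompt and response reservations are subtracted.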
def embed(
self,
texts: list[str] | None = None,

View File

@@ -42,6 +42,8 @@ class LlamaCppClient(GenAIClient):
_supports_vision: bool
_supports_audio: bool
_supports_tools: bool
_image_token_cache: dict[tuple[int, int], int]
_text_baseline_tokens: int | None
def _init_provider(self) -> str | None:
"""Initialize the client and query model metadata from the server."""
@@ -52,6 +54,8 @@
self._supports_vision = False
self._supports_audio = False
self._supports_tools = False
self._image_token_cache = {}
self._text_baseline_tokens = None
base_url = (
self.genai_config.base_url.rstrip("/")
@@ -272,6 +276,91 @@
return self._context_size
return 4096
def estimate_image_tokens(self, width: int, height: int) -> float:
"""Probe the llama.cpp server to learn the model's image-token cost at the
requested dimensions.
llama.cpp's image tokenization is a deterministic function of dimensions and
the loaded mmproj, so the result is cached per (width, height) for the
lifetime of the process. Falls back to the base pixel heuristic if the
server is unreachable or the response is malformed.
"""
if self.provider is None:
return super().estimate_image_tokens(width, height)
cached = self._image_token_cache.get((width, height))
if cached is not None:
return cached
try:
baseline = self._probe_baseline_tokens()
with_image = self._probe_image_prompt_tokens(width, height)
tokens = max(1, with_image - baseline)
except Exception as e:
logger.debug(
"llama.cpp image-token probe failed for %dx%d (%s); using heuristic",
width,
height,
e,
)
return super().estimate_image_tokens(width, height)
self._image_token_cache[(width, height)] = tokens
logger.debug(
"llama.cpp model '%s' uses ~%d tokens for %dx%d images",
self.genai_config.model,
tokens,
width,
height,
)
return tokens
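Hypothetical probe numbers, purely to illustrate the delta: if the text-only baseline reports usage.prompt_tokens = 12 and the same prompt plus a 640x480 gray JPEG reports 1036, the cached estimate is 1036 - 12 = 1024 tokens per image, roughly 4x what the pixel heuristic (245.76) would have assumed at this resolution.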
def _probe_baseline_tokens(self) -> int:
"""Return prompt_tokens for a minimal text-only request. Cached after first call."""
if self._text_baseline_tokens is not None:
return self._text_baseline_tokens
self._text_baseline_tokens = self._probe_prompt_tokens(
[{"type": "text", "text": "."}]
)
return self._text_baseline_tokens
def _probe_image_prompt_tokens(self, width: int, height: int) -> int:
"""Return prompt_tokens for a single synthetic image plus minimal text."""
img = Image.new("RGB", (width, height), (128, 128, 128))
buf = io.BytesIO()
img.save(buf, format="JPEG", quality=60)
encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
return self._probe_prompt_tokens(
[
{"type": "text", "text": "."},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
},
]
)
def _probe_prompt_tokens(self, content: list[dict[str, Any]]) -> int:
"""POST a 1-token chat completion and return reported prompt_tokens.
Uses a generous timeout to absorb a cold model load on the first probe
when the server lazily loads models on demand (e.g. llama-swap).
"""
payload = {
"model": self.genai_config.model,
"messages": [{"role": "user", "content": content}],
"max_tokens": 1,
}
response = requests.post(
f"{self.provider}/v1/chat/completions",
json=payload,
timeout=60,
)
response.raise_for_status()
return int(response.json()["usage"]["prompt_tokens"])
def _build_payload(
self,
messages: list[dict[str, Any]],

View File

@@ -123,6 +123,15 @@ def get_detector_temperature(
if index < len(hailo_device_names):
device_name = hailo_device_names[index]
return hailo_temps[device_name]
elif detector_type == "hailo10h":
# Get temperatures for Hailo devices
hailo_temps = get_hailo_temps()
if hailo_temps:
hailo_device_names = sorted(hailo_temps.keys())
index = detector_index_by_type.get("hailo10h", 0)
if index < len(hailo_device_names):
device_name = hailo_device_names[index]
return hailo_temps[device_name]
elif detector_type == "rknn":
# Rockchip temperatures are handled by the GPU / NPU stats
# as there are not detector specific temperatures

View File

@@ -257,6 +257,7 @@
"export": "Export",
"actions": "Actions",
"uiPlayground": "UI Playground",
"features": "Features",
"faceLibrary": "Face Library",
"classification": "Classification",
"chat": "Chat",

View File

@@ -397,6 +397,14 @@
"description": "The device to use for Hailo inference (e.g. 'PCIe', 'M.2')."
}
},
"hailo10h": {
"label": "Hailo-10H",
"description": "Hailo-10H detector using HEF models and the HailoRT SDK for inference on Hailo hardware.",
"device": {
"label": "Device Type",
"description": "The device to use for Hailo inference (e.g. 'PCIe', 'M.2')."
}
},
"memryx": {
"label": "MemryX",
"description": "MemryX MX3 detector that runs compiled DFP models on MemryX accelerators.",

View File

@@ -161,13 +161,13 @@ export function AnimatedEventCard({
<TooltipTrigger asChild>
<Button
className={cn(
"absolute left-2 top-1 z-40 transition-opacity",
"absolute left-2 top-1 z-40 bg-gray-500 bg-gradient-to-br from-gray-400 to-gray-500 transition-opacity",
threatLevel === ThreatLevel.SECURITY_CONCERN &&
"pointer-events-auto bg-severity_alert opacity-100 hover:bg-severity_alert",
"pointer-events-auto opacity-100",
threatLevel === ThreatLevel.NEEDS_REVIEW &&
"pointer-events-auto bg-severity_detection opacity-100 hover:bg-severity_detection",
"pointer-events-auto opacity-100",
threatLevel === ThreatLevel.NORMAL &&
"pointer-events-none bg-gray-500 bg-gradient-to-br from-gray-400 to-gray-500 opacity-0 group-hover:pointer-events-auto group-hover:opacity-100",
"pointer-events-none opacity-0 group-hover:pointer-events-auto group-hover:opacity-100",
)}
size="xs"
aria-label={t("markAsReviewed")}

View File

@@ -155,14 +155,40 @@ export function MessageBubble({
) : (
<div
className={cn(
"[&>*:last-child]:inline",
!isComplete &&
"after:ml-0.5 after:inline-block after:h-4 after:w-2 after:animate-cursor-blink after:rounded-sm after:bg-foreground after:align-middle after:content-['']",
"[&>p:last-child]:inline after:ml-0.5 after:inline-block after:h-4 after:w-2 after:animate-cursor-blink after:rounded-sm after:bg-foreground after:align-middle after:content-['']",
)}
>
<ReactMarkdown
remarkPlugins={[remarkGfm]}
components={{
p: ({ node: _n, ...props }) => (
<p className="my-2 first:mt-0 last:mb-0" {...props} />
),
ul: ({ node: _n, ...props }) => (
<ul
className="my-2 list-disc space-y-1 pl-6 first:mt-0 last:mb-0"
{...props}
/>
),
ol: ({ node: _n, ...props }) => (
<ol
className="my-2 list-decimal space-y-1 pl-6 first:mt-0 last:mb-0"
{...props}
/>
),
li: ({ node: _n, ...props }) => (
<li className="pl-1" {...props} />
),
code: ({ node: _n, className, ...props }) => (
<code
className={cn(
"rounded bg-foreground/10 px-1 py-0.5 font-mono text-sm",
className,
)}
{...props}
/>
),
table: ({ node: _n, ...props }) => (
<table
className="my-2 w-full border-collapse border border-border"

View File

@@ -14,7 +14,6 @@ import Step3ChooseExamples, {
Step3FormData,
} from "./wizard/Step3ChooseExamples";
import { cn } from "@/lib/utils";
import { isDesktop } from "react-device-detect";
import axios from "axios";
const OBJECT_STEPS = [
@@ -153,13 +152,9 @@ export default function ClassificationModelWizardDialog({
>
<DialogContent
className={cn(
"",
isDesktop &&
wizardState.currentStep == 0 &&
"max-h-[90%] overflow-y-auto xl:max-h-[80%]",
isDesktop &&
wizardState.currentStep > 0 &&
"max-h-[90%] max-w-[70%] overflow-y-auto xl:max-h-[80%]",
"scrollbar-container max-h-[90%] overflow-y-auto",
wizardState.currentStep == 0 && "xl:max-h-[80%]",
wizardState.currentStep > 0 && "md:max-w-[70%] xl:max-h-[80%]",
)}
onInteractOutside={(e) => {
e.preventDefault();

View File

@@ -6,6 +6,7 @@ import {
LuLifeBuoy,
LuList,
LuLogOut,
LuMessageSquare,
LuMoon,
LuSquarePen,
LuScanFace,
@@ -482,21 +483,25 @@ export default function GeneralSettings({ className }: GeneralSettingsProps) {
</Link>
</>
)}
{isAdmin && isMobile && config?.face_recognition.enabled && (
<>
<Link to="/faces">
<MenuItem
className="flex w-full items-center p-2 text-sm"
aria-label={t("menu.faceLibrary")}
>
<LuScanFace className="mr-2 size-4" />
<span>{t("menu.faceLibrary")}</span>
</MenuItem>
</Link>
</>
)}
{isAdmin && isMobile && (
<>
</DropdownMenuGroup>
{isMobile && isAdmin && (
<>
<DropdownMenuLabel className="mt-1">
{t("menu.features")}
</DropdownMenuLabel>
<DropdownMenuSeparator />
<DropdownMenuGroup className="flex flex-col">
{config?.face_recognition.enabled && (
<Link to="/faces">
<MenuItem
className="flex w-full items-center p-2 text-sm"
aria-label={t("menu.faceLibrary")}
>
<LuScanFace className="mr-2 size-4" />
<span>{t("menu.faceLibrary")}</span>
</MenuItem>
</Link>
)}
<Link to="/classification">
<MenuItem
className="flex w-full items-center p-2 text-sm"
@@ -506,9 +511,20 @@ export default function GeneralSettings({ className }: GeneralSettingsProps) {
<span>{t("menu.classification")}</span>
</MenuItem>
</Link>
</>
)}
</DropdownMenuGroup>
{config?.genai?.model !== "none" && (
<Link to="/chat">
<MenuItem
className="flex w-full items-center p-2 text-sm"
aria-label={t("menu.chat")}
>
<LuMessageSquare className="mr-2 size-4" />
<span>{t("menu.chat")}</span>
</MenuItem>
</Link>
)}
</DropdownMenuGroup>
</>
)}
<DropdownMenuLabel className={isDesktop ? "mt-3" : "mt-1"}>
{t("menu.appearance")}
</DropdownMenuLabel>