add area

Add error logs for Intel GPU stats
Store hwaccel errors with timeout so it can retry
2026-06-27 06:41:53 +03:00 · 2026-05-27 10:13:05 -05:00 · 2026-05-27 09:09:42 -06:00 · 2026-05-27 09:09:42 -06:00 · 2026-05-27 10:08:57 -05:00 · 2026-05-27 09:58:21 -05:00
4 changed files with 47 additions and 13 deletions
--- a/docs/docs/configuration/custom_classification/object_classification.md
+++ b/docs/docs/configuration/custom_classification/object_classification.md
@@ -149,9 +149,16 @@ For more detail, see [Frigate Tip: Best Practices for Training Face and Custom C
 - **The wizard is just the starting point**: You don't need to find and label every class upfront. Missing classes will naturally appear in Recent Classifications, and those images tend to be more valuable because they represent new conditions and edge cases.
 - **Problem framing**: Keep classes visually distinct and relevant to the chosen object types.
 - **Preprocessing**: Ensure examples reflect object crops similar to Frigate's boxes; keep the subject centered.
-- **Labels**: Keep label names short and consistent; include a `none` class if you plan to ignore uncertain predictions for sub labels.
+- **Crop size**: Aim for crops of at least 100×100 pixels (a 10,000 pixel area). Crops smaller than ~80×80 get stretched 3-7× by the model's 224×224 input resize and tend to collapse into a generic "blob" region of feature space where identity becomes unreliable. If most of your detections are small because the camera is far from the subject, consider repositioning the camera for closer crops.
+- **Class balance**: Aim to keep your largest class within ~3× the count of your smallest. Beyond that, the model becomes biased toward the dominant class and tends to default borderline predictions to it (the "everything looks like Buddy" failure mode).
 - **Threshold**: Tune `threshold` per model to reduce false assignments. Start at `0.8` and adjust based on validation.

+:::tip `none` works differently from named classes
+
+Named classes work best with visually uniform examples — every Buddy photo should look like Buddy. The `none` class needs the opposite: visual diversity across sizes, framings, and qualities, because at inference it has to absorb everything that isn't one of your named classes. Don't apply the same "only keep large, well-framed images" rule to `none` that you would to a named class. Mix in small crops, partial views, and false positives deliberately — otherwise the model has no signal for "small/ambiguous thing = not one of my known classes" and will force those crops into a named class by default.
+
+:::
+
 ## Debugging Classification Models

 To troubleshoot issues with object classification models, enable debug logging to see detailed information about classification attempts, scores, and consensus calculations.
--- a/frigate/stats/emitter.py
+++ b/frigate/stats/emitter.py
@@ -32,7 +32,7 @@ class StatsEmitter(threading.Thread):
        self.config = config
        self.stats_tracking = stats_tracking
        self.stop_event = stop_event
-        self.hwaccel_errors: list[str] = []
+        self.hwaccel_errors: dict[str, float] = {}
        self.stats_history: list[dict[str, Any]] = []

        # create communication for stats
--- a/frigate/stats/util.py
+++ b/frigate/stats/util.py
@@ -1,6 +1,7 @@
 """Utilities for stats."""

 import asyncio
+import logging
 import os
 import shutil
 import time
@@ -34,6 +35,10 @@ from frigate.util.services import (
 )
 from frigate.version import VERSION

+logger = logging.getLogger(__name__)
+
+HWACCEL_ERROR_COOLDOWN_SECONDS = 3600
+

 def get_latest_version(config: FrigateConfig) -> str:
    if not config.telemetry.version_check:
@@ -167,7 +172,9 @@ def get_detector_stats(


 def get_processing_stats(
-    config: FrigateConfig, stats: dict[str, str], hwaccel_errors: list[str]
+    config: FrigateConfig,
+    stats: dict[str, str],
+    hwaccel_errors: dict[str, float],
 ) -> None:
    """Get stats for cpu / gpu."""

@@ -206,7 +213,9 @@ async def set_bandwidth_stats(config: FrigateConfig, all_stats: dict[str, Any])


 async def set_gpu_stats(
-    config: FrigateConfig, all_stats: dict[str, Any], hwaccel_errors: list[str]
+    config: FrigateConfig,
+    all_stats: dict[str, Any],
+    hwaccel_errors: dict[str, float],
 ) -> None:
    """Parse GPUs from hwaccel args and use for stats."""
    hwaccel_args = []
@@ -231,12 +240,16 @@ async def set_gpu_stats(

    stats: dict[str, dict] = {}
    intel_gpu_collected = False
+    now = time.monotonic()

    for args in hwaccel_args:
-        if args in hwaccel_errors:
-            # known erroring args should automatically return as error
-            stats["error-gpu"] = {"gpu": "", "mem": ""}
-        elif "cuvid" in args or "nvidia" in args:
+        last_error = hwaccel_errors.get(args)
+        if last_error is not None:
+            if now - last_error < HWACCEL_ERROR_COOLDOWN_SECONDS:
+                continue
+            hwaccel_errors.pop(args, None)
+
+        if "cuvid" in args or "nvidia" in args:
            # nvidia GPU
            nvidia_usage = get_nvidia_gpu_stats()

@@ -253,7 +266,7 @@ async def set_gpu_stats(

            else:
                stats["nvidia-gpu"] = {"vendor": "nvidia", "gpu": "", "mem": ""}
-                hwaccel_errors.append(args)
+                hwaccel_errors[args] = time.monotonic()
        elif "nvmpi" in args or "jetson" in args:
            # nvidia Jetson
            jetson_usage = get_jetson_stats()
@@ -262,7 +275,7 @@ async def set_gpu_stats(
                stats["jetson-gpu"] = {"vendor": "nvidia", **jetson_usage}
            else:
                stats["jetson-gpu"] = {"vendor": "nvidia", "gpu": "", "mem": ""}
-                hwaccel_errors.append(args)
+                hwaccel_errors[args] = time.monotonic()
        elif "qsv" in args or ("vaapi" in args and not is_vaapi_amd_driver()):
            if not config.telemetry.stats.intel_gpu_stats:
                continue
@@ -280,7 +293,7 @@ async def set_gpu_stats(
                        stats[name] = entry
                else:
                    stats["intel-gpu"] = {"vendor": "intel", "gpu": "", "mem": ""}
-                    hwaccel_errors.append(args)
+                    hwaccel_errors[args] = time.monotonic()
        elif "vaapi" in args:
            if not config.telemetry.stats.amd_gpu_stats:
                continue
@@ -292,7 +305,7 @@ async def set_gpu_stats(
                stats["amd-vaapi"] = {"vendor": "amd", **amd_usage}
            else:
                stats["amd-vaapi"] = {"vendor": "amd", "gpu": "", "mem": ""}
-                hwaccel_errors.append(args)
+                hwaccel_errors[args] = time.monotonic()
        elif "preset-rk" in args:
            rga_usage = get_rockchip_gpu_stats()

@@ -328,7 +341,9 @@ async def set_npu_usages(config: FrigateConfig, all_stats: dict[str, Any]) -> No


 def stats_snapshot(
-    config: FrigateConfig, stats_tracking: StatsTrackingTypes, hwaccel_errors: list[str]
+    config: FrigateConfig,
+    stats_tracking: StatsTrackingTypes,
+    hwaccel_errors: dict[str, float],
 ) -> dict[str, Any]:
    """Get a snapshot of the current stats that are being tracked."""
    camera_metrics = stats_tracking["camera_metrics"]
--- a/frigate/util/services.py
+++ b/frigate/util/services.py
@@ -416,6 +416,11 @@ def get_intel_gpu_stats(

    snapshot_a = _read_intel_drm_fdinfo(target_pdev)
    if not snapshot_a:
+        logger.warning(
+            "Unable to collect Intel GPU stats: no DRM fdinfo entries found"
+            "%s. Check that /proc is readable and the i915/xe driver is loaded",
+            f" for pdev {target_pdev}" if target_pdev else "",
+        )
        return None

    start = time.monotonic()
@@ -424,6 +429,9 @@ def get_intel_gpu_stats(

    snapshot_b = _read_intel_drm_fdinfo(target_pdev)
    if not snapshot_b or elapsed_ns <= 0:
+        logger.warning(
+            "Unable to collect Intel GPU stats: second DRM fdinfo sample was empty"
+        )
        return None

    def _new_engine_pct() -> dict[str, float]:
@@ -464,6 +472,10 @@ def get_intel_gpu_stats(
        pid_pct[data_b["pid"]] = pid_pct.get(data_b["pid"], 0.0) + client_total

    if not per_pdev_engine_pct:
+        logger.warning(
+            "Unable to collect Intel GPU stats: no per-engine counters available "
+            "(i915 requires kernel >= 5.19)"
+        )
        return None

    names = intel_gpu_name_resolver.get_names()
Author	SHA1	Message	Date
Josh Hawkins	2c92a38124	add area	2026-05-27 10:13:05 -05:00
Nicolas Mowen	767853636c	Add error logs for Intel GPU stats	2026-05-27 09:09:42 -06:00
Nicolas Mowen	26e25c159d	Store hwaccel errors with timeout so it can retry	2026-05-27 09:09:42 -06:00
Josh Hawkins	fc5fbad047	tweak language	2026-05-27 10:08:57 -05:00
Josh Hawkins	c5bbf6db88	add more tips to object classification docs	2026-05-27 09:58:21 -05:00