From 831cfc24447b521e4d201d28135e4f3eab6fb9d1 Mon Sep 17 00:00:00 2001 From: Nicolas Mowen Date: Sun, 29 Mar 2026 11:09:02 -0600 Subject: [PATCH] Refactor Intel Stats (#22674) * Improve Intel stats collection * Update handling of stats to be simpler * Simplify handling * More accurately label Intel stats * Cleanup * Remove --- frigate/stats/prometheus.py | 21 +++++ frigate/stats/util.py | 46 ++++------- frigate/test/test_gpu_stats.py | 8 +- frigate/util/services.py | 103 +++++++++++++++--------- web/public/locales/en/views/system.json | 2 + web/src/components/Statusbar.tsx | 3 +- web/src/types/stats.ts | 2 + web/src/views/system/GeneralMetrics.tsx | 76 +++++++++++++++-- 8 files changed, 180 insertions(+), 81 deletions(-) diff --git a/frigate/stats/prometheus.py b/frigate/stats/prometheus.py index 67d8d03d8..d2e229568 100644 --- a/frigate/stats/prometheus.py +++ b/frigate/stats/prometheus.py @@ -355,16 +355,37 @@ class CustomCollector(object): gpu_mem_usages = GaugeMetricFamily( "frigate_gpu_mem_usage_percent", "GPU memory usage %", labels=["gpu_name"] ) + gpu_enc_usages = GaugeMetricFamily( + "frigate_gpu_encoder_usage_percent", + "GPU encoder utilisation %", + labels=["gpu_name"], + ) + gpu_compute_usages = GaugeMetricFamily( + "frigate_gpu_compute_usage_percent", + "GPU compute / encode utilisation %", + labels=["gpu_name"], + ) + gpu_dec_usages = GaugeMetricFamily( + "frigate_gpu_decoder_usage_percent", + "GPU decoder utilisation %", + labels=["gpu_name"], + ) try: for gpu_name, gpu_stats in stats["gpu_usages"].items(): self.add_metric(gpu_usages, [gpu_name], gpu_stats, "gpu") self.add_metric(gpu_mem_usages, [gpu_name], gpu_stats, "mem") + self.add_metric(gpu_enc_usages, [gpu_name], gpu_stats, "enc") + self.add_metric(gpu_compute_usages, [gpu_name], gpu_stats, "compute") + self.add_metric(gpu_dec_usages, [gpu_name], gpu_stats, "dec") except KeyError: pass yield gpu_usages yield gpu_mem_usages + yield gpu_enc_usages + yield gpu_compute_usages + yield 
gpu_dec_usages # service stats uptime_seconds = GaugeMetricFamily( diff --git a/frigate/stats/util.py b/frigate/stats/util.py index 708f6c5ed..99e9981bd 100644 --- a/frigate/stats/util.py +++ b/frigate/stats/util.py @@ -261,45 +261,33 @@ async def set_gpu_stats( else: stats["jetson-gpu"] = {"gpu": "", "mem": ""} hwaccel_errors.append(args) - elif "qsv" in args: + elif "qsv" in args or ("vaapi" in args and not is_vaapi_amd_driver()): if not config.telemetry.stats.intel_gpu_stats: continue - # intel QSV GPU - intel_usage = get_intel_gpu_stats(config.telemetry.stats.intel_gpu_device) - - if intel_usage is not None: - stats["intel-qsv"] = intel_usage or {"gpu": "", "mem": ""} - else: - stats["intel-qsv"] = {"gpu": "", "mem": ""} - hwaccel_errors.append(args) - elif "vaapi" in args: - if is_vaapi_amd_driver(): - if not config.telemetry.stats.amd_gpu_stats: - continue - - # AMD VAAPI GPU - amd_usage = get_amd_gpu_stats() - - if amd_usage: - stats["amd-vaapi"] = amd_usage - else: - stats["amd-vaapi"] = {"gpu": "", "mem": ""} - hwaccel_errors.append(args) - else: - if not config.telemetry.stats.intel_gpu_stats: - continue - - # intel VAAPI GPU + if "intel-gpu" not in stats: + # intel GPU (QSV or VAAPI both use the same physical GPU) intel_usage = get_intel_gpu_stats( config.telemetry.stats.intel_gpu_device ) if intel_usage is not None: - stats["intel-vaapi"] = intel_usage or {"gpu": "", "mem": ""} + stats["intel-gpu"] = intel_usage or {"gpu": "", "mem": ""} else: - stats["intel-vaapi"] = {"gpu": "", "mem": ""} + stats["intel-gpu"] = {"gpu": "", "mem": ""} hwaccel_errors.append(args) + elif "vaapi" in args: + if not config.telemetry.stats.amd_gpu_stats: + continue + + # AMD VAAPI GPU + amd_usage = get_amd_gpu_stats() + + if amd_usage: + stats["amd-vaapi"] = amd_usage + else: + stats["amd-vaapi"] = {"gpu": "", "mem": ""} + hwaccel_errors.append(args) elif "preset-rk" in args: rga_usage = get_rockchip_gpu_stats() diff --git a/frigate/test/test_gpu_stats.py 
b/frigate/test/test_gpu_stats.py index fd0df94c4..2604c4002 100644 --- a/frigate/test/test_gpu_stats.py +++ b/frigate/test/test_gpu_stats.py @@ -39,8 +39,12 @@ class TestGpuStats(unittest.TestCase): process.stdout = self.intel_results sp.return_value = process intel_stats = get_intel_gpu_stats(False) - print(f"the intel stats are {intel_stats}") + # rc6 values: 47.844741 and 100.0 → avg 73.92 → gpu = 100 - 73.92 = 26.08% + # Render/3D/0: 0.0 and 0.0 → compute = 0.0% + # Video/0: 4.533124 and 0.0 → dec = 2.27% assert intel_stats == { - "gpu": "1.13%", + "gpu": "26.08%", "mem": "-%", + "compute": "0.0%", + "dec": "2.27%", } diff --git a/frigate/util/services.py b/frigate/util/services.py index 8019f0092..f0bf2de1e 100644 --- a/frigate/util/services.py +++ b/frigate/util/services.py @@ -265,14 +265,30 @@ def get_intel_gpu_stats(intel_gpu_device: Optional[str]) -> Optional[dict[str, str]]: - """Get stats using intel_gpu_top.""" + """Get stats using intel_gpu_top. 
+ + Returns overall GPU usage derived from rc6 residency (idle time), + plus individual engine breakdowns: + - enc: Render/3D engine (compute/shader encoder, used by QSV) + - dec: Video engines (fixed-function codec, used by VAAPI) + """ def get_stats_manually(output: str) -> dict[str, str]: """Find global stats via regex when json fails to parse.""" reading = "".join(output) results: dict[str, str] = {} - # render is used for qsv + # rc6 residency for overall GPU usage + rc6_match = re.search(r'"rc6":\{"value":([\d.]+)', reading) + if rc6_match: + rc6_value = float(rc6_match.group(1)) + results["gpu"] = f"{round(100.0 - rc6_value, 2)}%" + else: + results["gpu"] = "-%" + + results["mem"] = "-%" + + # Render/3D is the compute/encode engine render = [] for result in re.findall(r'"Render/3D/0":{[a-z":\d.,%]+}', reading): packet = json.loads(result[14:]) @@ -280,11 +296,9 @@ def get_intel_gpu_stats(intel_gpu_device: Optional[str]) -> Optional[dict[str, s render.append(float(single)) if render: - render_avg = sum(render) / len(render) - else: - render_avg = 1 + results["compute"] = f"{round(sum(render) / len(render), 2)}%" - # video is used for vaapi + # Video engines are the fixed-function decode engines video = [] for result in re.findall(r'"Video/\d":{[a-z":\d.,%]+}', reading): packet = json.loads(result[10:]) @@ -292,12 +306,8 @@ def get_intel_gpu_stats(intel_gpu_device: Optional[str]) -> Optional[dict[str, s video.append(float(single)) if video: - video_avg = sum(video) / len(video) - else: - video_avg = 1 + results["dec"] = f"{round(sum(video) / len(video), 2)}%" - results["gpu"] = f"{round((video_avg + render_avg) / 2, 2)}%" - results["mem"] = "-%" return results intel_gpu_top_command = [ @@ -336,10 +346,18 @@ def get_intel_gpu_stats(intel_gpu_device: Optional[str]) -> Optional[dict[str, s return get_stats_manually(output) results: dict[str, str] = {} - render = {"global": []} - video = {"global": []} + rc6_values = [] + render_global = [] + video_global = [] + # 
per-client: {pid: [total_busy_per_sample, ...]} + client_usages: dict[str, list[float]] = {} for block in data: + # rc6 residency: percentage of time GPU is idle + rc6 = block.get("rc6", {}).get("value") + if rc6 is not None: + rc6_values.append(float(rc6)) + global_engine = block.get("engines") if global_engine: @@ -347,48 +365,53 @@ def get_intel_gpu_stats(intel_gpu_device: Optional[str]) -> Optional[dict[str, s video_frame = global_engine.get("Video/0", {}).get("busy") if render_frame is not None: - render["global"].append(float(render_frame)) + render_global.append(float(render_frame)) if video_frame is not None: - video["global"].append(float(video_frame)) + video_global.append(float(video_frame)) clients = block.get("clients", {}) - if clients and len(clients): + if clients: for client_block in clients.values(): - key = client_block["pid"] + pid = client_block["pid"] - if render.get(key) is None: - render[key] = [] - video[key] = [] + if pid not in client_usages: + client_usages[pid] = [] - client_engine = client_block.get("engine-classes", {}) + # Sum all engine-class busy values for this client + total_busy = 0.0 + for engine in client_block.get("engine-classes", {}).values(): + busy = engine.get("busy") + if busy is not None: + total_busy += float(busy) - render_frame = client_engine.get("Render/3D", {}).get("busy") - video_frame = client_engine.get("Video", {}).get("busy") + client_usages[pid].append(total_busy) - if render_frame is not None: - render[key].append(float(render_frame)) + # Overall GPU usage from rc6 (idle) residency + if rc6_values: + rc6_avg = sum(rc6_values) / len(rc6_values) + results["gpu"] = f"{round(100.0 - rc6_avg, 2)}%" - if video_frame is not None: - video[key].append(float(video_frame)) + results["mem"] = "-%" - if render["global"] and video["global"]: - results["gpu"] = ( - f"{round(((sum(render['global']) / len(render['global'])) + (sum(video['global']) / len(video['global']))) / 2, 2)}%" - ) - results["mem"] = "-%" + # Compute: 
Render/3D engine (compute/shader workloads and QSV encode) + if render_global: + results["compute"] = f"{round(sum(render_global) / len(render_global), 2)}%" - if len(render.keys()) > 1: + # Decoder: Video engine (fixed-function codec) + if video_global: + results["dec"] = f"{round(sum(video_global) / len(video_global), 2)}%" + + # Per-client GPU usage (sum of all engines per process) + if client_usages: results["clients"] = {} - for key in render.keys(): - if key == "global" or not render[key] or not video[key]: - continue - - results["clients"][key] = ( - f"{round(((sum(render[key]) / len(render[key])) + (sum(video[key]) / len(video[key]))) / 2, 2)}%" - ) + for pid, samples in client_usages.items(): + if samples: + results["clients"][pid] = ( + f"{round(sum(samples) / len(samples), 2)}%" + ) return results diff --git a/web/public/locales/en/views/system.json b/web/public/locales/en/views/system.json index 0e3d6a35e..6c3f37f71 100644 --- a/web/public/locales/en/views/system.json +++ b/web/public/locales/en/views/system.json @@ -78,6 +78,7 @@ "gpuUsage": "GPU Usage", "gpuMemory": "GPU Memory", "gpuEncoder": "GPU Encoder", + "gpuCompute": "GPU Compute / Encode", "gpuDecoder": "GPU Decoder", "gpuTemperature": "GPU Temperature", "gpuInfo": { @@ -188,6 +189,7 @@ "cameraFfmpeg": "{{camName}} FFmpeg", "cameraCapture": "{{camName}} capture", "cameraDetect": "{{camName}} detect", + "cameraGpu": "{{camName}} GPU", "cameraFramesPerSecond": "{{camName}} frames per second", "cameraDetectionsPerSecond": "{{camName}} detections per second", "cameraSkippedDetectionsPerSecond": "{{camName}} skipped detections per second" diff --git a/web/src/components/Statusbar.tsx b/web/src/components/Statusbar.tsx index d1035dd60..18a0d9ee1 100644 --- a/web/src/components/Statusbar.tsx +++ b/web/src/components/Statusbar.tsx @@ -116,8 +116,7 @@ export default function Statusbar() { case "amd-vaapi": gpuTitle = "AMD GPU"; break; - case "intel-vaapi": - case "intel-qsv": + case "intel-gpu": 
gpuTitle = "Intel GPU"; break; case "rockchip": diff --git a/web/src/types/stats.ts b/web/src/types/stats.ts index 1046e0b47..c4b811185 100644 --- a/web/src/types/stats.ts +++ b/web/src/types/stats.ts @@ -60,8 +60,10 @@ export type GpuStats = { mem: string; enc?: string; dec?: string; + compute?: string; pstate?: string; temp?: number; + clients?: { [pid: string]: string }; }; export type NpuStats = { diff --git a/web/src/views/system/GeneralMetrics.tsx b/web/src/views/system/GeneralMetrics.tsx index 2bac7cd2b..fc7410b5a 100644 --- a/web/src/views/system/GeneralMetrics.tsx +++ b/web/src/views/system/GeneralMetrics.tsx @@ -76,7 +76,7 @@ export default function GeneralMetrics({ statsHistory.length > 0 && Object.keys(statsHistory[0]?.gpu_usages ?? {}).forEach((key) => { - if (key == "amd-vaapi" || key == "intel-vaapi" || key == "intel-qsv") { + if (key == "amd-vaapi" || key == "intel-gpu") { vaCount += 1; } @@ -265,7 +265,7 @@ export default function GeneralMetrics({ if ( Object.keys(statsHistory?.at(0)?.gpu_usages ?? {}).length == 1 && - Object.keys(statsHistory?.at(0)?.gpu_usages ?? {})[0].includes("intel") + Object.keys(statsHistory?.at(0)?.gpu_usages ?? {})[0] === "intel-gpu" ) { // intel gpu stats do not support memory return undefined; @@ -334,6 +334,43 @@ export default function GeneralMetrics({ return Object.keys(series).length > 0 ? 
Object.values(series) : undefined; }, [statsHistory]); + const gpuComputeSeries = useMemo(() => { + if (!statsHistory) { + return []; + } + + const series: { + [key: string]: { name: string; data: { x: number; y: string }[] }; + } = {}; + let hasValidGpu = false; + + statsHistory.forEach((stats, statsIdx) => { + if (!stats) { + return; + } + + Object.entries(stats.gpu_usages || {}).forEach(([key, stats]) => { + if (!(key in series)) { + series[key] = { name: key, data: [] }; + } + + if (stats.compute) { + hasValidGpu = true; + series[key].data.push({ + x: statsIdx + 1, + y: stats.compute.slice(0, -1), + }); + } + }); + }); + + if (!hasValidGpu) { + return []; + } + + return Object.keys(series).length > 0 ? Object.values(series) : undefined; + }, [statsHistory]); + const gpuDecSeries = useMemo(() => { if (!statsHistory) { return []; @@ -409,9 +446,7 @@ export default function GeneralMetrics({ } const gpuKeys = Object.keys(statsHistory[0]?.gpu_usages ?? {}); - const hasIntelGpu = gpuKeys.some( - (key) => key === "intel-vaapi" || key === "intel-qsv", - ); + const hasIntelGpu = gpuKeys.some((key) => key === "intel-gpu"); if (!hasIntelGpu) { return false; @@ -427,7 +462,7 @@ export default function GeneralMetrics({ } Object.entries(stats.gpu_usages || {}).forEach(([key, gpuStats]) => { - if (key === "intel-vaapi" || key === "intel-qsv") { + if (key === "intel-gpu") { if (gpuStats.gpu) { hasDataPoints = true; const gpuValue = parseFloat(gpuStats.gpu.slice(0, -1)); @@ -744,8 +779,9 @@ export default function GeneralMetrics({ className={cn( "mt-4 grid grid-cols-1 gap-2 sm:grid-cols-2", gpuTempSeries?.length && "md:grid-cols-3", - gpuEncSeries?.length && "xl:grid-cols-4", - gpuEncSeries?.length && + (gpuEncSeries?.length || gpuComputeSeries?.length) && + "xl:grid-cols-4", + (gpuEncSeries?.length || gpuComputeSeries?.length) && gpuTempSeries?.length && "3xl:grid-cols-5", )} @@ -858,6 +894,30 @@ export default function GeneralMetrics({ ) : ( )} + {statsHistory.length != 0 ? 
( + <> + {gpuComputeSeries && gpuComputeSeries?.length != 0 && ( +
+
+ {t("general.hardwareInfo.gpuCompute")} +
+ {gpuComputeSeries.map((series) => ( + + ))} +
+ )} + + ) : ( + + )} {statsHistory.length != 0 ? ( <> {gpuDecSeries && gpuDecSeries?.length != 0 && (