From ef9d7e07b70b65bc6752acdb2677b55fe6a6662f Mon Sep 17 00:00:00 2001 From: Nicolas Mowen Date: Mon, 4 May 2026 09:36:32 -0600 Subject: [PATCH] Rewrite intel stats (#23108) * Rewrite intel GPU stats to use file descriptors instead of intel_gpu_top, leading to significantly better API for interaction and more accurate results * Update tests * Update docs * Adjust approach * Update strings --- .../hardware_acceleration_video.md | 78 +---- frigate/config/telemetry.py | 4 +- frigate/test/test_gpu_stats.py | 100 ++++-- frigate/util/services.py | 308 +++++++++++------- web/public/locales/en/config/cameras.json | 4 + web/public/locales/en/config/global.json | 8 +- 6 files changed, 276 insertions(+), 226 deletions(-) diff --git a/docs/docs/configuration/hardware_acceleration_video.md b/docs/docs/configuration/hardware_acceleration_video.md index 7aeecfda95..617d735395 100644 --- a/docs/docs/configuration/hardware_acceleration_video.md +++ b/docs/docs/configuration/hardware_acceleration_video.md @@ -136,90 +136,32 @@ ffmpeg: -### Configuring Intel GPU Stats in Docker +### Configuring Intel GPU Stats -Additional configuration is needed for the Docker container to be able to access the `intel_gpu_top` command for GPU stats. There are two options: +Frigate reads Intel GPU utilization directly from the kernel's per-client DRM usage counters exposed at `/proc/<pid>/fdinfo/<fd>`. This requires: -1. Run the container as privileged. -2. Add the `CAP_PERFMON` capability (note: you might need to set the `perf_event_paranoid` low enough to allow access to the performance event system.) +- Linux kernel **5.19 or newer** for the `i915` driver, or any release of the `xe` driver. +- Frigate running with permission to read other processes' fdinfo. Running as root inside the container (the default) satisfies this; non-root setups may need `CAP_SYS_PTRACE`. -#### Run as privileged +No `intel_gpu_top` binary, `CAP_PERFMON`, privileged mode, or `perf_event_paranoid` tuning is required. 
-This method works, but it gives more permissions to the container than are actually needed. +#### Stats for SR-IOV or specific devices -##### Docker Compose - Privileged - -```yaml -services: - frigate: - ... - image: ghcr.io/blakeblackshear/frigate:stable - # highlight-next-line - privileged: true -``` - -##### Docker Run CLI - Privileged - -```bash {4} -docker run -d \ - --name frigate \ - ... - --privileged \ - ghcr.io/blakeblackshear/frigate:stable -``` - -#### CAP_PERFMON - -Only recent versions of Docker support the `CAP_PERFMON` capability. You can test to see if yours supports it by running: `docker run --cap-add=CAP_PERFMON hello-world` - -##### Docker Compose - CAP_PERFMON - -```yaml {5,6} -services: - frigate: - ... - image: ghcr.io/blakeblackshear/frigate:stable - cap_add: - - CAP_PERFMON -``` - -##### Docker Run CLI - CAP_PERFMON - -```bash {4} -docker run -d \ - --name frigate \ - ... - --cap-add=CAP_PERFMON \ - ghcr.io/blakeblackshear/frigate:stable -``` - -#### perf_event_paranoid - -_Note: This setting must be changed for the entire system._ - -For more information on the various values across different distributions, see https://askubuntu.com/questions/1400874/what-does-perf-paranoia-level-four-do. - -Depending on your OS and kernel configuration, you may need to change the `/proc/sys/kernel/perf_event_paranoid` kernel tunable. You can test the change by running `sudo sh -c 'echo 2 >/proc/sys/kernel/perf_event_paranoid'` which will persist until a reboot. Make it permanent by running `sudo sh -c 'echo kernel.perf_event_paranoid=2 >> /etc/sysctl.d/local.conf'` - -#### Stats for SR-IOV or other devices - -When using virtualized GPUs via SR-IOV, you need to specify the device path to use to gather stats from `intel_gpu_top`. This example may work for some systems using SR-IOV: +If the host has more than one Intel GPU (e.g. 
an iGPU plus a discrete GPU, or SR-IOV virtual functions), pin stats collection to a specific device by setting `intel_gpu_device` to either its PCI bus address or a DRM card/render-node path: ```yaml telemetry: stats: - intel_gpu_device: "sriov" + intel_gpu_device: "0000:00:02.0" ``` -For other virtualized GPUs, try specifying the direct path to the device instead: - ```yaml telemetry: stats: - intel_gpu_device: "drm:/dev/dri/card0" + intel_gpu_device: "/dev/dri/card1" ``` -If you are passing in a device path, make sure you've passed the device through to the container. +When passing a device path, make sure the device is also passed through to the container. ## AMD-based CPUs diff --git a/frigate/config/telemetry.py b/frigate/config/telemetry.py index 41c3f7bbc2..f85ff343f3 100644 --- a/frigate/config/telemetry.py +++ b/frigate/config/telemetry.py @@ -25,8 +25,8 @@ class StatsConfig(FrigateBaseModel): ) intel_gpu_device: Optional[str] = Field( default=None, - title="SR-IOV device", - description="Device identifier used when treating Intel GPUs as SR-IOV to fix GPU stats.", + title="Intel GPU device", + description="PCI bus address or DRM device path (e.g. /dev/dri/card1) used to pin Intel GPU stats to a specific device when multiple are present.", ) diff --git a/frigate/test/test_gpu_stats.py b/frigate/test/test_gpu_stats.py index 2604c4002c..85b12138d5 100644 --- a/frigate/test/test_gpu_stats.py +++ b/frigate/test/test_gpu_stats.py @@ -7,8 +7,6 @@ from frigate.util.services import get_amd_gpu_stats, get_intel_gpu_stats class TestGpuStats(unittest.TestCase): def setUp(self): self.amd_results = "Unknown Radeon card. 
<= R500 won't work, new cards might.\nDumping to -, line limit 1.\n1664070990.607556: bus 10, gpu 4.17%, ee 0.00%, vgt 0.00%, ta 0.00%, tc 0.00%, sx 0.00%, sh 0.00%, spi 0.83%, smx 0.00%, cr 0.00%, sc 0.00%, pa 0.00%, db 0.00%, cb 0.00%, vram 60.37% 294.04mb, gtt 0.33% 52.21mb, mclk 100.00% 1.800ghz, sclk 26.65% 0.533ghz\n" - self.intel_results = """{"period":{"duration":1.194033,"unit":"ms"},"frequency":{"requested":0.000000,"actual":0.000000,"unit":"MHz"},"interrupts":{"count":3349.991164,"unit":"irq/s"},"rc6":{"value":47.844741,"unit":"%"},"engines":{"Render/3D/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Blitter/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Video/0":{"busy":4.533124,"sema":0.000000,"wait":0.000000,"unit":"%"},"Video/1":{"busy":6.194385,"sema":0.000000,"wait":0.000000,"unit":"%"},"VideoEnhance/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"}}},{"period":{"duration":1.189291,"unit":"ms"},"frequency":{"requested":0.000000,"actual":0.000000,"unit":"MHz"},"interrupts":{"count":0.000000,"unit":"irq/s"},"rc6":{"value":100.000000,"unit":"%"},"engines":{"Render/3D/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Blitter/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Video/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Video/1":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"VideoEnhance/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"}}}""" - self.nvidia_results = "name, utilization.gpu [%], memory.used [MiB], memory.total [MiB]\nNVIDIA GeForce RTX 3050, 42 %, 5036 MiB, 8192 MiB\n" @patch("subprocess.run") def test_amd_gpu_stats(self, sp): @@ -19,32 +17,76 @@ class TestGpuStats(unittest.TestCase): amd_stats = get_amd_gpu_stats() assert amd_stats == {"gpu": "4.17%", "mem": "60.37%"} - # @patch("subprocess.run") - # def test_nvidia_gpu_stats(self, sp): - # process = MagicMock() - # process.returncode = 0 - # process.stdout 
= self.nvidia_results - # sp.return_value = process - # nvidia_stats = get_nvidia_gpu_stats() - # assert nvidia_stats == { - # "name": "NVIDIA GeForce RTX 3050", - # "gpu": "42 %", - # "mem": "61.5 %", - # } + @patch("frigate.util.services.time.sleep") + @patch("frigate.util.services.time.monotonic") + @patch("frigate.util.services._read_intel_drm_fdinfo") + def test_intel_gpu_stats_fdinfo(self, read_fdinfo, monotonic, sleep): + # 1 second of wall clock between snapshots + monotonic.side_effect = [0.0, 1.0] - @patch("subprocess.run") - def test_intel_gpu_stats(self, sp): - process = MagicMock() - process.returncode = 124 - process.stdout = self.intel_results - sp.return_value = process - intel_stats = get_intel_gpu_stats(False) - # rc6 values: 47.844741 and 100.0 → avg 73.92 → gpu = 100 - 73.92 = 26.08% - # Render/3D/0: 0.0 and 0.0 → enc = 0.0% - # Video/0: 4.533124 and 0.0 → dec = 2.27% - assert intel_stats == { - "gpu": "26.08%", - "mem": "-%", - "compute": "0.0%", - "dec": "2.27%", + # Two i915 clients on the same iGPU. Engine values are cumulative ns. 
+ # Deltas over the 1s window: + # client A (pid 100): render +200_000_000 (20%), video +500_000_000 (50%), + # video-enhance +100_000_000 (10%) + # client B (pid 200): compute +100_000_000 (10%) + # Engine totals → render 20, video 50, video-enhance 10, compute 10 + # → compute = render + compute = 30 + # → dec = video + video-enhance = 60 + # → gpu = compute + dec = 90 + snapshot_a = { + ("0000:00:02.0", "1", "100"): { + "driver": "i915", + "pid": "100", + "engines": { + "render": (1_000_000_000, 0), + "video": (5_000_000_000, 0), + "video-enhance": (200_000_000, 0), + "compute": (0, 0), + }, + }, + ("0000:00:02.0", "2", "200"): { + "driver": "i915", + "pid": "200", + "engines": { + "render": (0, 0), + "compute": (2_000_000_000, 0), + }, + }, } + snapshot_b = { + ("0000:00:02.0", "1", "100"): { + "driver": "i915", + "pid": "100", + "engines": { + "render": (1_200_000_000, 0), + "video": (5_500_000_000, 0), + "video-enhance": (300_000_000, 0), + "compute": (0, 0), + }, + }, + ("0000:00:02.0", "2", "200"): { + "driver": "i915", + "pid": "200", + "engines": { + "render": (0, 0), + "compute": (2_100_000_000, 0), + }, + }, + } + read_fdinfo.side_effect = [snapshot_a, snapshot_b] + + intel_stats = get_intel_gpu_stats(None) + + sleep.assert_called_once() + assert intel_stats == { + "gpu": "90.0%", + "mem": "-%", + "compute": "30.0%", + "dec": "60.0%", + "clients": {"100": "80.0%", "200": "10.0%"}, + } + + @patch("frigate.util.services._read_intel_drm_fdinfo") + def test_intel_gpu_stats_no_clients(self, read_fdinfo): + read_fdinfo.return_value = {} + assert get_intel_gpu_stats(None) is None diff --git a/frigate/util/services.py b/frigate/util/services.py index 159b9b6834..657cf6d552 100644 --- a/frigate/util/services.py +++ b/frigate/util/services.py @@ -264,156 +264,214 @@ def get_amd_gpu_stats() -> Optional[dict[str, str]]: return results -def get_intel_gpu_stats(intel_gpu_device: Optional[str]) -> Optional[dict[str, str]]: - """Get stats using intel_gpu_top. 
+_INTEL_FDINFO_SAMPLE_SECONDS = 1.0 - Returns overall GPU usage derived from rc6 residency (idle time), - plus individual engine breakdowns: - - enc: Render/3D engine (compute/shader encoder, used by QSV) - - dec: Video engines (fixed-function codec, used by VAAPI) +# Engines we track. Render/3D and Compute are pooled into "compute"; Video and +# VideoEnhance into "dec" (VideoEnhance is the post-process engine that handles +# VAAPI scaling/deinterlace/CSC, e.g. ffmpeg `-vf scale_vaapi=...`). The Copy +# (DMA blitter) engine is intentionally ignored — it represents transparent +# memory transfers, not user-visible GPU work. +# i915 fdinfo keys (cumulative ns) → logical engine name. +_I915_ENGINE_KEYS = { + "drm-engine-render": "render", + "drm-engine-video": "video", + "drm-engine-video-enhance": "video-enhance", + "drm-engine-compute": "compute", +} +# Xe fdinfo suffixes (cumulative cycles, paired with drm-total-cycles-*). +_XE_ENGINE_KEYS = { + "rcs": "render", + "vcs": "video", + "vecs": "video-enhance", + "ccs": "compute", +} + + +def _resolve_intel_gpu_pdev(device: Optional[str]) -> Optional[str]: + """Map a configured GPU hint (/dev/dri/card1, renderD128, or a PCI bus + address) to its drm-pdev string so we can filter fdinfo entries to that + device. Returns None when no hint is supplied or it cannot be resolved.""" + if not device: + return None + + if re.match(r"^[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F]$", device): + return device + + name = os.path.basename(device.rstrip("/")) + try: + return os.path.basename(os.path.realpath(f"/sys/class/drm/{name}/device")) + except OSError: + return None + + +def _read_intel_drm_fdinfo(target_pdev: Optional[str]) -> dict: + """Snapshot DRM fdinfo for every Intel client visible in /proc. + + Returns a dict keyed by (pdev, drm-client-id, pid) so the same context + seen via multiple file descriptors on a single process collapses to one + entry. 
""" - - def get_stats_manually(output: str) -> dict[str, str]: - """Find global stats via regex when json fails to parse.""" - reading = "".join(output) - results: dict[str, str] = {} - - # rc6 residency for overall GPU usage - rc6_match = re.search(r'"rc6":\{"value":([\d.]+)', reading) - if rc6_match: - rc6_value = float(rc6_match.group(1)) - results["gpu"] = f"{round(100.0 - rc6_value, 2)}%" - else: - results["gpu"] = "-%" - - results["mem"] = "-%" - - # Render/3D is the compute/encode engine - render = [] - for result in re.findall(r'"Render/3D/0":{[a-z":\d.,%]+}', reading): - packet = json.loads(result[14:]) - single = packet.get("busy", 0.0) - render.append(float(single)) - - if render: - results["compute"] = f"{round(sum(render) / len(render), 2)}%" - - # Video engines are the fixed-function decode engines - video = [] - for result in re.findall(r'"Video/\d":{[a-z":\d.,%]+}', reading): - packet = json.loads(result[10:]) - single = packet.get("busy", 0.0) - video.append(float(single)) - - if video: - results["dec"] = f"{round(sum(video) / len(video), 2)}%" - - return results - - intel_gpu_top_command = [ - "timeout", - "0.5s", - "intel_gpu_top", - "-J", - "-o", - "-", - "-s", - "1000", # Intel changed this from seconds to milliseconds in 2024+ versions - ] - - if intel_gpu_device: - intel_gpu_top_command += ["-d", intel_gpu_device] + snapshot: dict = {} try: - p = sp.run( - intel_gpu_top_command, - encoding="ascii", - capture_output=True, - ) - except UnicodeDecodeError: - return None + proc_entries = os.listdir("/proc") + except OSError: + return snapshot - # timeout has a non-zero returncode when timeout is reached - if p.returncode != 124: - logger.error(f"Unable to poll intel GPU stats: {p.stderr}") - return None - else: - output = "".join(p.stdout.split()) + for entry in proc_entries: + if not entry.isdigit(): + continue + fdinfo_dir = f"/proc/{entry}/fdinfo" try: - data = json.loads(f"[{output}]") - except json.JSONDecodeError: - return 
get_stats_manually(output) + fds = os.listdir(fdinfo_dir) + except (FileNotFoundError, PermissionError, NotADirectoryError, OSError): + continue - results: dict[str, str] = {} - rc6_values = [] - render_global = [] - video_global = [] - # per-client: {pid: [total_busy_per_sample, ...]} - client_usages: dict[str, list[float]] = {} + for fd in fds: + try: + with open(f"{fdinfo_dir}/{fd}") as f: + content = f.read() + except (FileNotFoundError, PermissionError, OSError): + continue - for block in data: - # rc6 residency: percentage of time GPU is idle - rc6 = block.get("rc6", {}).get("value") - if rc6 is not None: - rc6_values.append(float(rc6)) + if "drm-driver" not in content: + continue - global_engine = block.get("engines") + fields: dict[str, str] = {} + for line in content.splitlines(): + key, sep, value = line.partition(":") + if sep: + fields[key.strip()] = value.strip() - if global_engine: - render_frame = global_engine.get("Render/3D/0", {}).get("busy") - video_frame = global_engine.get("Video/0", {}).get("busy") + driver = fields.get("drm-driver") + if driver not in ("i915", "xe"): + continue - if render_frame is not None: - render_global.append(float(render_frame)) + pdev = fields.get("drm-pdev", "") + if target_pdev and pdev != target_pdev: + continue - if video_frame is not None: - video_global.append(float(video_frame)) + client_id = fields.get("drm-client-id") + if not client_id: + continue - clients = block.get("clients", {}) + key = (pdev, client_id, entry) + if key in snapshot: + continue - if clients: - for client_block in clients.values(): - pid = client_block["pid"] + engines: dict[str, tuple[int, int]] = {} - if pid not in client_usages: - client_usages[pid] = [] + if driver == "i915": + for fkey, engine in _I915_ENGINE_KEYS.items(): + raw = fields.get(fkey) + if not raw: + continue + try: + engines[engine] = (int(raw.split()[0]), 0) + except (ValueError, IndexError): + continue + else: + for suffix, engine in _XE_ENGINE_KEYS.items(): + busy_raw 
= fields.get(f"drm-cycles-{suffix}") + total_raw = fields.get(f"drm-total-cycles-{suffix}") + if not (busy_raw and total_raw): + continue + try: + engines[engine] = ( + int(busy_raw.split()[0]), + int(total_raw.split()[0]), + ) + except (ValueError, IndexError): + continue - # Sum all engine-class busy values for this client - total_busy = 0.0 - for engine in client_block.get("engine-classes", {}).values(): - busy = engine.get("busy") - if busy is not None: - total_busy += float(busy) + if not engines: + continue - client_usages[pid].append(total_busy) + snapshot[key] = {"driver": driver, "pid": entry, "engines": engines} - # Overall GPU usage from rc6 (idle) residency - if rc6_values: - rc6_avg = sum(rc6_values) / len(rc6_values) - results["gpu"] = f"{round(100.0 - rc6_avg, 2)}%" + return snapshot - results["mem"] = "-%" - # Compute: Render/3D engine (compute/shader workloads and QSV encode) - if render_global: - results["compute"] = f"{round(sum(render_global) / len(render_global), 2)}%" +def get_intel_gpu_stats(intel_gpu_device: Optional[str]) -> Optional[dict[str, Any]]: + """Get stats by reading DRM fdinfo files. - # Decoder: Video engine (fixed-function codec) - if video_global: - results["dec"] = f"{round(sum(video_global) / len(video_global), 2)}%" + Each DRM client FD exposes monotonic per-engine busy counters via + /proc/<pid>/fdinfo/<fd> (i915 since kernel 5.19, Xe since first release). + We sample twice and divide busy-time deltas by wall-clock to derive + utilization. Render/3D and Compute are pooled into "compute"; Video and + VideoEnhance into "dec". Overall "gpu" is the sum of those pools (clamped + to 100%). 
+ """ + target_pdev = _resolve_intel_gpu_pdev(intel_gpu_device) - # Per-client GPU usage (sum of all engines per process) - if client_usages: - results["clients"] = {} + snapshot_a = _read_intel_drm_fdinfo(target_pdev) + if not snapshot_a: + return None - for pid, samples in client_usages.items(): - if samples: - results["clients"][pid] = ( - f"{round(sum(samples) / len(samples), 2)}%" - ) + start = time.monotonic() + time.sleep(_INTEL_FDINFO_SAMPLE_SECONDS) + elapsed_ns = (time.monotonic() - start) * 1e9 - return results + snapshot_b = _read_intel_drm_fdinfo(target_pdev) + if not snapshot_b or elapsed_ns <= 0: + return None + + engine_pct: dict[str, float] = { + "render": 0.0, + "video": 0.0, + "video-enhance": 0.0, + "compute": 0.0, + } + pid_pct: dict[str, float] = {} + + for key, data_b in snapshot_b.items(): + data_a = snapshot_a.get(key) + if not data_a or data_a["driver"] != data_b["driver"]: + continue + + client_total = 0.0 + for engine, (busy_b, total_b) in data_b["engines"].items(): + if engine not in engine_pct: + continue + + busy_a, total_a = data_a["engines"].get(engine, (busy_b, total_b)) + + if data_b["driver"] == "i915": + delta = max(0, busy_b - busy_a) + pct = min(100.0, delta / elapsed_ns * 100.0) + else: + delta_busy = max(0, busy_b - busy_a) + delta_total = total_b - total_a + if delta_total <= 0: + continue + pct = min(100.0, delta_busy / delta_total * 100.0) + + engine_pct[engine] += pct + client_total += pct + + pid_pct[data_b["pid"]] = pid_pct.get(data_b["pid"], 0.0) + client_total + + for engine in engine_pct: + engine_pct[engine] = min(100.0, engine_pct[engine]) + + compute_pct = min(100.0, engine_pct["render"] + engine_pct["compute"]) + dec_pct = min(100.0, engine_pct["video"] + engine_pct["video-enhance"]) + overall_pct = min(100.0, compute_pct + dec_pct) + + results: dict[str, Any] = { + "gpu": f"{round(overall_pct, 2)}%", + "mem": "-%", + "compute": f"{round(compute_pct, 2)}%", + "dec": f"{round(dec_pct, 2)}%", + } + + if pid_pct: + 
results["clients"] = { + pid: f"{round(min(100.0, pct), 2)}%" for pid, pct in pid_pct.items() + } + + return results def get_openvino_npu_stats() -> Optional[dict[str, str]]: diff --git a/web/public/locales/en/config/cameras.json b/web/public/locales/en/config/cameras.json index 1b524c347d..9320159f4c 100644 --- a/web/public/locales/en/config/cameras.json +++ b/web/public/locales/en/config/cameras.json @@ -485,6 +485,10 @@ "hwaccel_args": { "label": "Export hwaccel args", "description": "Hardware acceleration args to use for export/transcode operations." + }, + "max_concurrent": { + "label": "Maximum concurrent exports", + "description": "Maximum number of export jobs to process at the same time." } }, "preview": { diff --git a/web/public/locales/en/config/global.json b/web/public/locales/en/config/global.json index 69c77fad11..b2df826527 100644 --- a/web/public/locales/en/config/global.json +++ b/web/public/locales/en/config/global.json @@ -242,8 +242,8 @@ "description": "Enable per-process network bandwidth monitoring for camera ffmpeg processes and detectors (requires capabilities)." }, "intel_gpu_device": { - "label": "SR-IOV device", - "description": "Device identifier used when treating Intel GPUs as SR-IOV to fix GPU stats." + "label": "Intel GPU device", + "description": "PCI bus address or DRM device path (e.g. /dev/dri/card1) used to pin Intel GPU stats to a specific device when multiple are present." } }, "version_check": { @@ -1000,6 +1000,10 @@ "hwaccel_args": { "label": "Export hwaccel args", "description": "Hardware acceleration args to use for export/transcode operations." + }, + "max_concurrent": { + "label": "Maximum concurrent exports", + "description": "Maximum number of export jobs to process at the same time." } }, "preview": {