Improve Intel Stats (#23190)
Some checks are pending
CI / AMD64 Build (push) Waiting to run
CI / ARM Build (push) Waiting to run
CI / Jetson Jetpack 6 (push) Waiting to run
CI / AMD64 Extra Build (push) Blocked by required conditions
CI / ARM Extra Build (push) Blocked by required conditions
CI / Synaptics Build (push) Blocked by required conditions
CI / Assemble and push default build (push) Blocked by required conditions

* Implement per intel-gpu stats collection

* Improve device naming

* Improve GPU vendor handling

* Cleanup
This commit is contained in:
Nicolas Mowen 2026-05-13 15:12:48 -06:00 committed by GitHub
parent c8cfb9400a
commit 78fc472026
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 214 additions and 60 deletions

View File

@ -0,0 +1,109 @@
"""Resolve human-readable names for Intel GPUs via OpenVINO."""
import logging
import re
from typing import Optional
logger = logging.getLogger(__name__)
class IntelGpuNameResolver:
    """Build a pdev -> normalized device name map by enumerating OpenVINO GPUs.

    The lookup is performed once on first access and cached for the process
    lifetime. OpenVINO exposes DEVICE_PCI_INFO (domain/bus/device/function) and
    FULL_DEVICE_NAME for each GPU it can see, which is enough to associate the
    name with the pdev string used by DRM fdinfo.
    """

    # Cached pdev -> name map. None until the first get_names() call; the
    # first call stores the result (possibly empty) on the instance so the
    # enumeration runs at most once per process.
    _names: Optional[dict[str, str]] = None

    def get_names(self) -> dict[str, str]:
        """Return the pdev -> display-name map, building and caching it on first use.

        Returns an empty dict when OpenVINO is unavailable or its runtime fails
        to initialize; the empty result is cached too, so failures are not
        retried on every stats poll.
        """
        if self._names is not None:
            return self._names

        names: dict[str, str] = {}

        try:
            # Imported lazily: OpenVINO is optional and import is expensive.
            from openvino import Core
        except ImportError:
            logger.debug("OpenVINO unavailable; cannot resolve Intel GPU names")
            self._names = names
            return names

        try:
            core = Core()
            devices = core.available_devices
        except Exception as exc:
            # Core() can fail on hosts without a usable OpenVINO runtime.
            logger.debug("OpenVINO Core initialization failed: %s", exc)
            self._names = names
            return names

        # The CPU name serves as a fallback label for integrated GPUs, which
        # OpenVINO reports with only a generic FULL_DEVICE_NAME.
        cpu_name: Optional[str] = None
        if "CPU" in devices:
            try:
                cpu_name = self._strip_trademarks(
                    core.get_property("CPU", "FULL_DEVICE_NAME")
                )
            except Exception as exc:
                logger.debug("Failed to read CPU FULL_DEVICE_NAME: %s", exc)

        for device in devices:
            if not device.startswith("GPU"):
                continue

            try:
                pci = core.get_property(device, "DEVICE_PCI_INFO")
                raw_name = core.get_property(device, "FULL_DEVICE_NAME")
                device_type = core.get_property(device, "DEVICE_TYPE")
            except Exception as exc:
                # Skip devices whose properties can't be read; a partial map
                # is still useful for the GPUs that did resolve.
                logger.debug("Failed to read properties for %s: %s", device, exc)
                continue

            pdev = self._format_pdev(pci)

            if not pdev:
                continue

            names[pdev] = self._resolve_name(raw_name, device_type, cpu_name)

        self._names = names
        return names

    @staticmethod
    def _format_pdev(pci) -> Optional[str]:
        """Format OpenVINO PCI info as the DRM pdev string ("dddd:bb:dd.f").

        Returns None when the object lacks the expected
        domain/bus/device/function attributes.
        """
        try:
            return f"{pci.domain:04x}:{pci.bus:02x}:{pci.device:02x}.{pci.function:x}"
        except AttributeError:
            return None

    @classmethod
    def _resolve_name(cls, raw_name: str, device_type, cpu_name: Optional[str]) -> str:
        """Build a display name for a GPU.

        Modern integrated Intel GPUs are reported by OpenVINO with a generic
        FULL_DEVICE_NAME like "Intel(R) Graphics (iGPU)" that gives no model
        information. Since the iGPU is part of the CPU on these platforms, fall
        back to the CPU name (which OpenVINO does report specifically) and
        suffix it with "iGPU" so it's clear what the entry is.
        """
        is_integrated = "INTEGRATED" in str(device_type).upper()

        if is_integrated and cpu_name:
            # Drop the leading "Intel" brand to keep the label compact.
            short_cpu = re.sub(r"^Intel\s+", "", cpu_name)
            return f"{short_cpu} iGPU"

        return cls._normalize_name(raw_name)

    @classmethod
    def _normalize_name(cls, name: str) -> str:
        """Strip trademarks and a trailing "(iGPU)"/"(dGPU)" suffix from a name."""
        cleaned = cls._strip_trademarks(name)
        cleaned = re.sub(r"\s*\((?:i|d)GPU\)\s*$", "", cleaned, flags=re.IGNORECASE)
        return " ".join(cleaned.split())

    @staticmethod
    def _strip_trademarks(name: str) -> str:
        """Remove "(R)"/"(TM)" markers and collapse the resulting whitespace."""
        cleaned = re.sub(r"\(R\)|\(TM\)", "", name)
        return " ".join(cleaned.split())


# Process-wide singleton; the map is cached on this instance after first use.
intel_gpu_name_resolver = IntelGpuNameResolver()

View File

@ -230,6 +230,7 @@ async def set_gpu_stats(
hwaccel_args.append(args)
stats: dict[str, dict] = {}
intel_gpu_collected = False
for args in hwaccel_args:
if args in hwaccel_errors:
@ -242,6 +243,7 @@ async def set_gpu_stats(
if nvidia_usage:
for i in range(len(nvidia_usage)):
stats[nvidia_usage[i]["name"]] = {
"vendor": "nvidia",
"gpu": str(round(float(nvidia_usage[i]["gpu"]), 2)) + "%",
"mem": str(round(float(nvidia_usage[i]["mem"]), 2)) + "%",
"enc": str(round(float(nvidia_usage[i]["enc"]), 2)) + "%",
@ -250,31 +252,34 @@ async def set_gpu_stats(
}
else:
stats["nvidia-gpu"] = {"gpu": "", "mem": ""}
stats["nvidia-gpu"] = {"vendor": "nvidia", "gpu": "", "mem": ""}
hwaccel_errors.append(args)
elif "nvmpi" in args or "jetson" in args:
# nvidia Jetson
jetson_usage = get_jetson_stats()
if jetson_usage:
stats["jetson-gpu"] = jetson_usage
stats["jetson-gpu"] = {"vendor": "nvidia", **jetson_usage}
else:
stats["jetson-gpu"] = {"gpu": "", "mem": ""}
stats["jetson-gpu"] = {"vendor": "nvidia", "gpu": "", "mem": ""}
hwaccel_errors.append(args)
elif "qsv" in args or ("vaapi" in args and not is_vaapi_amd_driver()):
if not config.telemetry.stats.intel_gpu_stats:
continue
if "intel-gpu" not in stats:
if not intel_gpu_collected:
# intel GPU (QSV or VAAPI both use the same physical GPU)
intel_gpu_collected = True
intel_usage = get_intel_gpu_stats(
config.telemetry.stats.intel_gpu_device
)
if intel_usage is not None:
stats["intel-gpu"] = intel_usage or {"gpu": "", "mem": ""}
if intel_usage:
for entry in intel_usage.values():
name = entry.pop("name")
stats[name] = entry
else:
stats["intel-gpu"] = {"gpu": "", "mem": ""}
stats["intel-gpu"] = {"vendor": "intel", "gpu": "", "mem": ""}
hwaccel_errors.append(args)
elif "vaapi" in args:
if not config.telemetry.stats.amd_gpu_stats:
@ -284,18 +289,18 @@ async def set_gpu_stats(
amd_usage = get_amd_gpu_stats()
if amd_usage:
stats["amd-vaapi"] = amd_usage
stats["amd-vaapi"] = {"vendor": "amd", **amd_usage}
else:
stats["amd-vaapi"] = {"gpu": "", "mem": ""}
stats["amd-vaapi"] = {"vendor": "amd", "gpu": "", "mem": ""}
hwaccel_errors.append(args)
elif "preset-rk" in args:
rga_usage = get_rockchip_gpu_stats()
if rga_usage:
stats["rockchip"] = rga_usage
stats["rockchip"] = {"vendor": "rockchip", **rga_usage}
elif "v4l2m2m" in args or "rpi" in args:
# RPi v4l2m2m is currently not able to get usage stats
stats["rpi-v4l2m2m"] = {"gpu": "", "mem": ""}
stats["rpi-v4l2m2m"] = {"vendor": "rpi", "gpu": "", "mem": ""}
if stats:
all_stats["gpu_usages"] = stats

View File

@ -17,12 +17,14 @@ class TestGpuStats(unittest.TestCase):
amd_stats = get_amd_gpu_stats()
assert amd_stats == {"gpu": "4.17%", "mem": "60.37%"}
@patch("frigate.stats.intel_gpu_info.intel_gpu_name_resolver.get_names")
@patch("frigate.util.services.time.sleep")
@patch("frigate.util.services.time.monotonic")
@patch("frigate.util.services._read_intel_drm_fdinfo")
def test_intel_gpu_stats_fdinfo(self, read_fdinfo, monotonic, sleep):
def test_intel_gpu_stats_fdinfo(self, read_fdinfo, monotonic, sleep, get_names):
# 1 second of wall clock between snapshots
monotonic.side_effect = [0.0, 1.0]
get_names.return_value = {"0000:00:02.0": "Intel Graphics"}
# Two i915 clients on the same iGPU. Engine values are cumulative ns.
# Deltas over the 1s window:
@ -79,11 +81,15 @@ class TestGpuStats(unittest.TestCase):
sleep.assert_called_once()
assert intel_stats == {
"gpu": "90.0%",
"mem": "-%",
"compute": "30.0%",
"dec": "60.0%",
"clients": {"100": "80.0%", "200": "10.0%"},
"0000:00:02.0": {
"name": "Intel Graphics",
"vendor": "intel",
"gpu": "90.0%",
"mem": "-%",
"compute": "30.0%",
"dec": "60.0%",
"clients": {"100": "80.0%", "200": "10.0%"},
},
}
@patch("frigate.util.services._read_intel_drm_fdinfo")

View File

@ -393,8 +393,10 @@ def _read_intel_drm_fdinfo(target_pdev: Optional[str]) -> dict:
return snapshot
def get_intel_gpu_stats(intel_gpu_device: Optional[str]) -> Optional[dict[str, Any]]:
"""Get stats by reading DRM fdinfo files.
def get_intel_gpu_stats(
intel_gpu_device: Optional[str],
) -> Optional[dict[str, dict[str, Any]]]:
"""Get stats by reading DRM fdinfo files, bucketed per-pdev.
Each DRM client FD exposes monotonic per-engine busy counters via
/proc/<pid>/fdinfo/<fd> (i915 since kernel 5.19, Xe since first release).
@ -402,7 +404,14 @@ def get_intel_gpu_stats(intel_gpu_device: Optional[str]) -> Optional[dict[str, A
utilization. Render/3D and Compute are pooled into "compute"; Video and
VideoEnhance into "dec". Overall "gpu" is the sum of those pools (clamped
to 100%).
The return value is keyed by the GPU's drm-pdev string so multiple Intel
GPUs in the same system are reported separately. Each entry carries a
"name" populated from OpenVINO (falling back to the pdev) so callers can
surface a real device name in the UI.
"""
from frigate.stats.intel_gpu_info import intel_gpu_name_resolver
target_pdev = _resolve_intel_gpu_pdev(intel_gpu_device)
snapshot_a = _read_intel_drm_fdinfo(target_pdev)
@ -417,19 +426,21 @@ def get_intel_gpu_stats(intel_gpu_device: Optional[str]) -> Optional[dict[str, A
if not snapshot_b or elapsed_ns <= 0:
return None
engine_pct: dict[str, float] = {
"render": 0.0,
"video": 0.0,
"video-enhance": 0.0,
"compute": 0.0,
}
pid_pct: dict[str, float] = {}
def _new_engine_pct() -> dict[str, float]:
return {"render": 0.0, "video": 0.0, "video-enhance": 0.0, "compute": 0.0}
per_pdev_engine_pct: dict[str, dict[str, float]] = {}
per_pdev_pid_pct: dict[str, dict[str, float]] = {}
for key, data_b in snapshot_b.items():
data_a = snapshot_a.get(key)
if not data_a or data_a["driver"] != data_b["driver"]:
continue
pdev = key[0]
engine_pct = per_pdev_engine_pct.setdefault(pdev, _new_engine_pct())
pid_pct = per_pdev_pid_pct.setdefault(pdev, {})
client_total = 0.0
for engine, (busy_b, total_b) in data_b["engines"].items():
if engine not in engine_pct:
@ -452,25 +463,37 @@ def get_intel_gpu_stats(intel_gpu_device: Optional[str]) -> Optional[dict[str, A
pid_pct[data_b["pid"]] = pid_pct.get(data_b["pid"], 0.0) + client_total
for engine in engine_pct:
engine_pct[engine] = min(100.0, engine_pct[engine])
if not per_pdev_engine_pct:
return None
compute_pct = min(100.0, engine_pct["render"] + engine_pct["compute"])
dec_pct = min(100.0, engine_pct["video"] + engine_pct["video-enhance"])
overall_pct = min(100.0, compute_pct + dec_pct)
names = intel_gpu_name_resolver.get_names()
results: dict[str, dict[str, Any]] = {}
results: dict[str, Any] = {
"gpu": f"{round(overall_pct, 2)}%",
"mem": "-%",
"compute": f"{round(compute_pct, 2)}%",
"dec": f"{round(dec_pct, 2)}%",
}
for pdev, engine_pct in per_pdev_engine_pct.items():
for engine in engine_pct:
engine_pct[engine] = min(100.0, engine_pct[engine])
if pid_pct:
results["clients"] = {
pid: f"{round(min(100.0, pct), 2)}%" for pid, pct in pid_pct.items()
compute_pct = min(100.0, engine_pct["render"] + engine_pct["compute"])
dec_pct = min(100.0, engine_pct["video"] + engine_pct["video-enhance"])
overall_pct = min(100.0, compute_pct + dec_pct)
entry: dict[str, Any] = {
"name": names.get(pdev) or f"Intel GPU {pdev}",
"vendor": "intel",
"gpu": f"{round(overall_pct, 2)}%",
"mem": "-%",
"compute": f"{round(compute_pct, 2)}%",
"dec": f"{round(dec_pct, 2)}%",
}
pid_pct = per_pdev_pid_pct.get(pdev)
if pid_pct:
entry["clients"] = {
pid: f"{round(min(100.0, pct), 2)}%" for pid, pct in pid_pct.items()
}
results[pdev] = entry
return results

View File

@ -62,7 +62,10 @@ export type ExtraProcessStats = {
mem?: string;
};
export type GpuVendor = "intel" | "amd" | "nvidia" | "rockchip" | "rpi";
export type GpuStats = {
vendor?: GpuVendor;
gpu: string;
mem: string;
enc?: string;

View File

@ -1,5 +1,5 @@
import useSWR from "swr";
import { FrigateStats, GpuInfo } from "@/types/stats";
import { FrigateStats, GpuInfo, GpuStats } from "@/types/stats";
import { startTransition, useEffect, useMemo, useState } from "react";
import { useFrigateStats } from "@/api/ws";
import {
@ -98,13 +98,11 @@ export default function GeneralMetrics({
let nvCount = 0;
statsHistory.length > 0 &&
Object.keys(statsHistory[0]?.gpu_usages ?? {}).forEach((key) => {
if (key == "amd-vaapi" || key == "intel-gpu") {
vaCount += 1;
}
if (key.includes("NVIDIA")) {
Object.values(statsHistory[0]?.gpu_usages ?? {}).forEach((stats) => {
if (stats.vendor === "nvidia") {
nvCount += 1;
} else if (stats.vendor === "intel" || stats.vendor === "amd") {
vaCount += 1;
}
});
@ -288,11 +286,15 @@ export default function GeneralMetrics({
return [];
}
// Intel doesn't expose VRAM usage, so hide the memory section
// entirely when every reporting GPU is Intel.
const firstEntries: GpuStats[] = Object.values(
statsHistory[0]?.gpu_usages ?? {},
);
if (
Object.keys(statsHistory?.at(0)?.gpu_usages ?? {}).length == 1 &&
Object.keys(statsHistory?.at(0)?.gpu_usages ?? {})[0] === "intel-gpu"
firstEntries.length > 0 &&
firstEntries.every((s) => s.vendor === "intel")
) {
// intel gpu stats do not support memory
return undefined;
}
@ -307,6 +309,10 @@ export default function GeneralMetrics({
}
Object.entries(stats.gpu_usages || {}).forEach(([key, stats]) => {
if (stats.vendor === "intel") {
return;
}
if (!(key in series)) {
series[key] = { name: key, data: [] };
}
@ -470,8 +476,9 @@ export default function GeneralMetrics({
return false;
}
const gpuKeys = Object.keys(statsHistory[0]?.gpu_usages ?? {});
const hasIntelGpu = gpuKeys.some((key) => key === "intel-gpu");
const hasIntelGpu = Object.values(statsHistory[0]?.gpu_usages ?? {}).some(
(stats) => stats.vendor === "intel",
);
if (!hasIntelGpu) {
return false;
@ -486,14 +493,15 @@ export default function GeneralMetrics({
continue;
}
Object.entries(stats.gpu_usages || {}).forEach(([key, gpuStats]) => {
if (key === "intel-gpu") {
if (gpuStats.gpu) {
hasDataPoints = true;
const gpuValue = parseFloat(gpuStats.gpu.slice(0, -1));
if (!isNaN(gpuValue) && gpuValue > 0) {
allZero = false;
}
Object.values(stats.gpu_usages || {}).forEach((gpuStats) => {
if (gpuStats.vendor !== "intel") {
return;
}
if (gpuStats.gpu) {
hasDataPoints = true;
const gpuValue = parseFloat(gpuStats.gpu.slice(0, -1));
if (!isNaN(gpuValue) && gpuValue > 0) {
allZero = false;
}
}
});