mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-01-22 20:18:30 +03:00
Add support for GPU and NPU temperatures (#21495)
* Add rockchip temps * Add support for GPU and NPU temperatures in the frontend * Add support for Nvidia temperature * Improve separation * Adjust graph scaling
This commit is contained in:
parent
7fb8d9b050
commit
aa0b082184
@ -123,6 +123,10 @@ def get_detector_temperature(
|
|||||||
if index < len(hailo_device_names):
|
if index < len(hailo_device_names):
|
||||||
device_name = hailo_device_names[index]
|
device_name = hailo_device_names[index]
|
||||||
return hailo_temps[device_name]
|
return hailo_temps[device_name]
|
||||||
|
elif detector_type == "rknn":
|
||||||
|
# Rockchip temperatures are handled by the GPU / NPU stats
|
||||||
|
# as there are not detector specific temperatures
|
||||||
|
pass
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -242,6 +246,7 @@ async def set_gpu_stats(
|
|||||||
"mem": str(round(float(nvidia_usage[i]["mem"]), 2)) + "%",
|
"mem": str(round(float(nvidia_usage[i]["mem"]), 2)) + "%",
|
||||||
"enc": str(round(float(nvidia_usage[i]["enc"]), 2)) + "%",
|
"enc": str(round(float(nvidia_usage[i]["enc"]), 2)) + "%",
|
||||||
"dec": str(round(float(nvidia_usage[i]["dec"]), 2)) + "%",
|
"dec": str(round(float(nvidia_usage[i]["dec"]), 2)) + "%",
|
||||||
|
"temp": str(nvidia_usage[i]["temp"]),
|
||||||
}
|
}
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -417,12 +417,12 @@ def get_openvino_npu_stats() -> Optional[dict[str, str]]:
|
|||||||
else:
|
else:
|
||||||
usage = 0.0
|
usage = 0.0
|
||||||
|
|
||||||
return {"npu": f"{round(usage, 2)}", "mem": "-"}
|
return {"npu": f"{round(usage, 2)}", "mem": "-%"}
|
||||||
except (FileNotFoundError, PermissionError, ValueError):
|
except (FileNotFoundError, PermissionError, ValueError):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_rockchip_gpu_stats() -> Optional[dict[str, str]]:
|
def get_rockchip_gpu_stats() -> Optional[dict[str, str | float]]:
|
||||||
"""Get GPU stats using rk."""
|
"""Get GPU stats using rk."""
|
||||||
try:
|
try:
|
||||||
with open("/sys/kernel/debug/rkrga/load", "r") as f:
|
with open("/sys/kernel/debug/rkrga/load", "r") as f:
|
||||||
@ -440,7 +440,16 @@ def get_rockchip_gpu_stats() -> Optional[dict[str, str]]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
average_load = f"{round(sum(load_values) / len(load_values), 2)}%"
|
average_load = f"{round(sum(load_values) / len(load_values), 2)}%"
|
||||||
return {"gpu": average_load, "mem": "-"}
|
stats: dict[str, str | float] = {"gpu": average_load, "mem": "-%"}
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open("/sys/class/thermal/thermal_zone5/temp", "r") as f:
|
||||||
|
line = f.readline().strip()
|
||||||
|
stats["temp"] = round(int(line) / 1000, 1)
|
||||||
|
except (FileNotFoundError, OSError, ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
return stats
|
||||||
|
|
||||||
|
|
||||||
def get_rockchip_npu_stats() -> Optional[dict[str, float | str]]:
|
def get_rockchip_npu_stats() -> Optional[dict[str, float | str]]:
|
||||||
@ -463,13 +472,25 @@ def get_rockchip_npu_stats() -> Optional[dict[str, float | str]]:
|
|||||||
|
|
||||||
percentages = [int(load) for load in core_loads]
|
percentages = [int(load) for load in core_loads]
|
||||||
mean = round(sum(percentages) / len(percentages), 2)
|
mean = round(sum(percentages) / len(percentages), 2)
|
||||||
return {"npu": mean, "mem": "-"}
|
stats: dict[str, float | str] = {"npu": mean, "mem": "-%"}
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open("/sys/class/thermal/thermal_zone6/temp", "r") as f:
|
||||||
|
line = f.readline().strip()
|
||||||
|
stats["temp"] = round(int(line) / 1000, 1)
|
||||||
|
except (FileNotFoundError, OSError, ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
return stats
|
||||||
|
|
||||||
|
|
||||||
def try_get_info(f, h, default="N/A"):
|
def try_get_info(f, h, default="N/A", sensor=None):
|
||||||
try:
|
try:
|
||||||
if h:
|
if h:
|
||||||
v = f(h)
|
if sensor is not None:
|
||||||
|
v = f(h, sensor)
|
||||||
|
else:
|
||||||
|
v = f(h)
|
||||||
else:
|
else:
|
||||||
v = f()
|
v = f()
|
||||||
except nvml.NVMLError_NotSupported:
|
except nvml.NVMLError_NotSupported:
|
||||||
@ -498,6 +519,9 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
|
|||||||
util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
|
util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
|
||||||
enc = try_get_info(nvml.nvmlDeviceGetEncoderUtilization, handle)
|
enc = try_get_info(nvml.nvmlDeviceGetEncoderUtilization, handle)
|
||||||
dec = try_get_info(nvml.nvmlDeviceGetDecoderUtilization, handle)
|
dec = try_get_info(nvml.nvmlDeviceGetDecoderUtilization, handle)
|
||||||
|
temp = try_get_info(
|
||||||
|
nvml.nvmlDeviceGetTemperature, handle, default=None, sensor=0
|
||||||
|
)
|
||||||
pstate = try_get_info(nvml.nvmlDeviceGetPowerState, handle, default=None)
|
pstate = try_get_info(nvml.nvmlDeviceGetPowerState, handle, default=None)
|
||||||
|
|
||||||
if util != "N/A":
|
if util != "N/A":
|
||||||
@ -510,6 +534,11 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
|
|||||||
else:
|
else:
|
||||||
gpu_mem_util = -1
|
gpu_mem_util = -1
|
||||||
|
|
||||||
|
if temp != "N/A" and temp is not None:
|
||||||
|
temp = float(temp)
|
||||||
|
else:
|
||||||
|
temp = None
|
||||||
|
|
||||||
if enc != "N/A":
|
if enc != "N/A":
|
||||||
enc_util = enc[0]
|
enc_util = enc[0]
|
||||||
else:
|
else:
|
||||||
@ -527,6 +556,7 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
|
|||||||
"enc": enc_util,
|
"enc": enc_util,
|
||||||
"dec": dec_util,
|
"dec": dec_util,
|
||||||
"pstate": pstate or "unknown",
|
"pstate": pstate or "unknown",
|
||||||
|
"temp": temp,
|
||||||
}
|
}
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|||||||
@ -51,6 +51,7 @@
|
|||||||
"gpuMemory": "GPU Memory",
|
"gpuMemory": "GPU Memory",
|
||||||
"gpuEncoder": "GPU Encoder",
|
"gpuEncoder": "GPU Encoder",
|
||||||
"gpuDecoder": "GPU Decoder",
|
"gpuDecoder": "GPU Decoder",
|
||||||
|
"gpuTemperature": "GPU Temperature",
|
||||||
"gpuInfo": {
|
"gpuInfo": {
|
||||||
"vainfoOutput": {
|
"vainfoOutput": {
|
||||||
"title": "Vainfo Output",
|
"title": "Vainfo Output",
|
||||||
@ -77,6 +78,7 @@
|
|||||||
},
|
},
|
||||||
"npuUsage": "NPU Usage",
|
"npuUsage": "NPU Usage",
|
||||||
"npuMemory": "NPU Memory",
|
"npuMemory": "NPU Memory",
|
||||||
|
"npuTemperature": "NPU Temperature",
|
||||||
"intelGpuWarning": {
|
"intelGpuWarning": {
|
||||||
"title": "Intel GPU Stats Warning",
|
"title": "Intel GPU Stats Warning",
|
||||||
"message": "GPU stats unavailable",
|
"message": "GPU stats unavailable",
|
||||||
|
|||||||
@ -61,11 +61,13 @@ export type GpuStats = {
|
|||||||
enc?: string;
|
enc?: string;
|
||||||
dec?: string;
|
dec?: string;
|
||||||
pstate?: string;
|
pstate?: string;
|
||||||
|
temp?: number;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type NpuStats = {
|
export type NpuStats = {
|
||||||
npu: number;
|
npu: number;
|
||||||
mem: string;
|
mem: string;
|
||||||
|
temp?: number;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type GpuInfo = "vainfo" | "nvinfo";
|
export type GpuInfo = "vainfo" | "nvinfo";
|
||||||
|
|||||||
@ -368,6 +368,40 @@ export default function GeneralMetrics({
|
|||||||
return Object.keys(series).length > 0 ? Object.values(series) : undefined;
|
return Object.keys(series).length > 0 ? Object.values(series) : undefined;
|
||||||
}, [statsHistory]);
|
}, [statsHistory]);
|
||||||
|
|
||||||
|
const gpuTempSeries = useMemo(() => {
|
||||||
|
if (!statsHistory) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
const series: {
|
||||||
|
[key: string]: { name: string; data: { x: number; y: number }[] };
|
||||||
|
} = {};
|
||||||
|
let hasValidGpu = false;
|
||||||
|
|
||||||
|
statsHistory.forEach((stats, statsIdx) => {
|
||||||
|
if (!stats) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Object.entries(stats.gpu_usages || {}).forEach(([key, stats]) => {
|
||||||
|
if (!(key in series)) {
|
||||||
|
series[key] = { name: key, data: [] };
|
||||||
|
}
|
||||||
|
|
||||||
|
if (stats.temp !== undefined) {
|
||||||
|
hasValidGpu = true;
|
||||||
|
series[key].data.push({ x: statsIdx + 1, y: stats.temp });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!hasValidGpu) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
return Object.keys(series).length > 0 ? Object.values(series) : undefined;
|
||||||
|
}, [statsHistory]);
|
||||||
|
|
||||||
// Check if Intel GPU has all 0% usage values (known bug)
|
// Check if Intel GPU has all 0% usage values (known bug)
|
||||||
const showIntelGpuWarning = useMemo(() => {
|
const showIntelGpuWarning = useMemo(() => {
|
||||||
if (!statsHistory || statsHistory.length < 3) {
|
if (!statsHistory || statsHistory.length < 3) {
|
||||||
@ -448,6 +482,40 @@ export default function GeneralMetrics({
|
|||||||
return Object.keys(series).length > 0 ? Object.values(series) : [];
|
return Object.keys(series).length > 0 ? Object.values(series) : [];
|
||||||
}, [statsHistory]);
|
}, [statsHistory]);
|
||||||
|
|
||||||
|
const npuTempSeries = useMemo(() => {
|
||||||
|
if (!statsHistory) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
const series: {
|
||||||
|
[key: string]: { name: string; data: { x: number; y: number }[] };
|
||||||
|
} = {};
|
||||||
|
let hasValidNpu = false;
|
||||||
|
|
||||||
|
statsHistory.forEach((stats, statsIdx) => {
|
||||||
|
if (!stats) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Object.entries(stats.npu_usages || {}).forEach(([key, stats]) => {
|
||||||
|
if (!(key in series)) {
|
||||||
|
series[key] = { name: key, data: [] };
|
||||||
|
}
|
||||||
|
|
||||||
|
if (stats.temp !== undefined) {
|
||||||
|
hasValidNpu = true;
|
||||||
|
series[key].data.push({ x: statsIdx + 1, y: stats.temp });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!hasValidNpu) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
return Object.keys(series).length > 0 ? Object.values(series) : undefined;
|
||||||
|
}, [statsHistory]);
|
||||||
|
|
||||||
// other processes stats
|
// other processes stats
|
||||||
|
|
||||||
const hardwareType = useMemo(() => {
|
const hardwareType = useMemo(() => {
|
||||||
@ -669,7 +737,11 @@ export default function GeneralMetrics({
|
|||||||
<div
|
<div
|
||||||
className={cn(
|
className={cn(
|
||||||
"mt-4 grid grid-cols-1 gap-2 sm:grid-cols-2",
|
"mt-4 grid grid-cols-1 gap-2 sm:grid-cols-2",
|
||||||
gpuEncSeries?.length && "md:grid-cols-4",
|
gpuTempSeries?.length && "md:grid-cols-3",
|
||||||
|
gpuEncSeries?.length && "xl:grid-cols-4",
|
||||||
|
gpuEncSeries?.length &&
|
||||||
|
gpuTempSeries?.length &&
|
||||||
|
"3xl:grid-cols-5",
|
||||||
)}
|
)}
|
||||||
>
|
>
|
||||||
{statsHistory[0]?.gpu_usages && (
|
{statsHistory[0]?.gpu_usages && (
|
||||||
@ -804,6 +876,30 @@ export default function GeneralMetrics({
|
|||||||
) : (
|
) : (
|
||||||
<Skeleton className="aspect-video w-full" />
|
<Skeleton className="aspect-video w-full" />
|
||||||
)}
|
)}
|
||||||
|
{statsHistory.length != 0 ? (
|
||||||
|
<>
|
||||||
|
{gpuTempSeries && gpuTempSeries?.length != 0 && (
|
||||||
|
<div className="rounded-lg bg-background_alt p-2.5 md:rounded-2xl">
|
||||||
|
<div className="mb-5">
|
||||||
|
{t("general.hardwareInfo.gpuTemperature")}
|
||||||
|
</div>
|
||||||
|
{gpuTempSeries.map((series) => (
|
||||||
|
<ThresholdBarGraph
|
||||||
|
key={series.name}
|
||||||
|
graphId={`${series.name}-temp`}
|
||||||
|
name={series.name}
|
||||||
|
unit="°C"
|
||||||
|
threshold={DetectorTempThreshold}
|
||||||
|
updateTimes={updateTimes}
|
||||||
|
data={[series]}
|
||||||
|
/>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</>
|
||||||
|
) : (
|
||||||
|
<Skeleton className="aspect-video w-full" />
|
||||||
|
)}
|
||||||
|
|
||||||
{statsHistory[0]?.npu_usages && (
|
{statsHistory[0]?.npu_usages && (
|
||||||
<>
|
<>
|
||||||
@ -827,6 +923,30 @@ export default function GeneralMetrics({
|
|||||||
) : (
|
) : (
|
||||||
<Skeleton className="aspect-video w-full" />
|
<Skeleton className="aspect-video w-full" />
|
||||||
)}
|
)}
|
||||||
|
{statsHistory.length != 0 ? (
|
||||||
|
<>
|
||||||
|
{npuTempSeries && npuTempSeries?.length != 0 && (
|
||||||
|
<div className="rounded-lg bg-background_alt p-2.5 md:rounded-2xl">
|
||||||
|
<div className="mb-5">
|
||||||
|
{t("general.hardwareInfo.npuTemperature")}
|
||||||
|
</div>
|
||||||
|
{npuTempSeries.map((series) => (
|
||||||
|
<ThresholdBarGraph
|
||||||
|
key={series.name}
|
||||||
|
graphId={`${series.name}-temp`}
|
||||||
|
name={series.name}
|
||||||
|
unit="°C"
|
||||||
|
threshold={DetectorTempThreshold}
|
||||||
|
updateTimes={updateTimes}
|
||||||
|
data={[series]}
|
||||||
|
/>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</>
|
||||||
|
) : (
|
||||||
|
<Skeleton className="aspect-video w-full" />
|
||||||
|
)}
|
||||||
</>
|
</>
|
||||||
)}
|
)}
|
||||||
</>
|
</>
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user