def try_get_info(f, h, default="N/A"):
    """Call an NVML query, falling back to a default when it is unsupported.

    Args:
        f: Query function that accepts a device handle.
        h: Device handle passed to ``f``.
        default: Value returned when the query raises
            ``nvml.NVMLError_NotSupported``.

    Returns:
        The result of ``f(h)``, or ``default`` when the device does not
        support the query. Any other exception propagates to the caller.
    """
    try:
        return f(h)
    except nvml.NVMLError_NotSupported:
        return default


def get_nvidia_gpu_stats() -> Optional[dict[int, dict[str, Any]]]:
    """Get utilization and memory stats for each NVIDIA GPU using NVML.

    Returns:
        A dict keyed by NVML device index. Each value holds the device
        ``name``, its ``gpu`` utilization as reported by NVML, and ``mem``,
        the used memory as a percentage of total memory. Queries a device
        does not support are reported as ``0``. Returns ``None`` when NVML
        cannot be initialized or a device cannot be queried, so callers can
        fall back to their "no stats" path.
    """
    try:
        nvml.nvmlInit()
    except nvml.NVMLError as e:
        # No driver / no library / no permission: report it instead of
        # letting the exception escape into the stats loop.
        logger.error(f"Unable to poll nvidia GPU stats: {e}")
        return None

    results: dict[int, dict[str, Any]] = {}

    try:
        for i in range(nvml.nvmlDeviceGetCount()):
            handle = nvml.nvmlDeviceGetHandleByIndex(i)

            # Use None as the "unsupported" marker so the result is never
            # confused with a real value.
            util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle, None)
            meminfo = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle, None)

            if meminfo is not None and meminfo.total:
                mem_percent = meminfo.used / meminfo.total * 100
            else:
                # Unsupported query or a zero total: avoid dividing by zero.
                mem_percent = 0

            results[i] = {
                "name": nvml.nvmlDeviceGetName(handle),
                "gpu": util.gpu if util is not None else 0,
                "mem": mem_percent,
            }
    except nvml.NVMLError as e:
        logger.error(f"Unable to poll nvidia GPU stats: {e}")
        return None
    finally:
        # Every successful nvmlInit() must be balanced by nvmlShutdown().
        nvml.nvmlShutdown()

    return results