From 351bef936f9fbdd444cb56af6722891f7024028f Mon Sep 17 00:00:00 2001 From: felalex Date: Sat, 2 May 2026 23:58:06 -0700 Subject: [PATCH] fix: omit gpu_mem_limit on CUDA query failure instead of guessing 4 GB When cudaMemGetInfo cannot be called or returns an error, compute_cuda_mem_limit now returns None and the caller skips injecting gpu_mem_limit, leaving ORT to manage its own arena (grow-as-needed up to device capacity). Tradeoff documented in the docstring: - Old behavior returned a hardcoded 4 GB. That was wrong for low-VRAM devices (Jetson Nano 4 GB shared, Quadro K620 2 GB, GT 1030 2 GB) and broken /dev/nvidia* container passthroughs, where requesting 4 GB caused cudaErrorMemoryAllocation at session init. It was also wrong for big GPUs (24 GB RTX 3090 with 20 GB free), needlessly starving the session. - The leak vectors fixed elsewhere in this PR (mem_pattern + mallopt) are independent of the BFC arena cap, so dropping the cap on the failure path does not reintroduce the leak. Co-Authored-By: Claude Opus 4.7 --- frigate/detectors/detection_runners.py | 13 ++++--- frigate/test/test_detection_runners.py | 53 ++++++++++++++++++++++---- frigate/util/model.py | 35 ++++++++++++----- 3 files changed, 79 insertions(+), 22 deletions(-) diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py index 922c7748e..09f88a116 100644 --- a/frigate/detectors/detection_runners.py +++ b/frigate/detectors/detection_runners.py @@ -636,8 +636,12 @@ def get_optimized_runner( ) except Exception as e: logger.warning( - "CUDA graph capture failed for %s, falling back to standard ONNX runner: %s", + "CUDA graph capture failed for model_type=%s path=%s " + "device_id=%s providers=%s; falling back to standard ONNX runner: %s", + model_type, model_path, + cuda_graph_options.get("device_id"), + providers, e, ) @@ -651,10 +655,9 @@ def get_optimized_runner( options.pop(0) if providers and providers[0] == "CUDAExecutionProvider": - options[0] = { - **options[0], - "gpu_mem_limit": compute_cuda_mem_limit(model_path, cuda_graph=False), - } + gpu_mem_limit = compute_cuda_mem_limit(model_path, cuda_graph=False) + if gpu_mem_limit is not None: + options[0] = {**options[0], "gpu_mem_limit": gpu_mem_limit} return ONNXModelRunner( ort.InferenceSession( diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py index e8079f1c6..4f11d7afb 100644 --- a/frigate/test/test_detection_runners.py +++ b/frigate/test/test_detection_runners.py @@ -125,11 +125,12 @@ class TestComputeCudaMemLimit(unittest.TestCase): self.assertLessEqual(limit, int(total_vram * 0.80)) @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) - def test_fallback_on_cuda_unavailable(self, _mock_cdll): + def test_returns_none_when_cuda_unavailable(self, _mock_cdll): + # See compute_cuda_mem_limit docstring for the tradeoff: returning a + # hardcoded fallback was wrong for low-VRAM devices (Jetson Nano, K620). 
from frigate.util.model import compute_cuda_mem_limit - limit = compute_cuda_mem_limit("/fake/model.onnx") - self.assertEqual(limit, 4 * 1024**3) + self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx")) @patch("frigate.util.model.ctypes.CDLL") @patch("os.path.getsize", return_value=50 * 1024 * 1024) @@ -148,17 +149,17 @@ class TestComputeCudaMemLimit(unittest.TestCase): @patch("frigate.util.model.ctypes.CDLL") @patch("os.path.getsize", return_value=200 * 1024 * 1024) - def test_fallback_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll): + def test_returns_none_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll): # Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0, - # producing gpu_mem_limit=0 and immediate session OOM. + # producing gpu_mem_limit=0 and immediate session OOM. We now return + # None so the caller omits gpu_mem_limit and ORT manages the arena. from frigate.util.model import compute_cuda_mem_limit mock_lib = MagicMock() mock_cdll.return_value = mock_lib mock_lib.cudaMemGetInfo.return_value = 2 # cudaErrorMemoryAllocation - limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) - self.assertEqual(limit, 4 * 1024**3) + self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)) @patch("frigate.util.model.ctypes.CDLL") @patch("os.path.getsize", return_value=200 * 1024 * 1024) @@ -341,6 +342,44 @@ class TestOrtLeakFixRegression(unittest.TestCase): ) +class TestRunnerOmitsGpuMemLimitOnCudaQueryFailure(unittest.TestCase): + """When compute_cuda_mem_limit returns None, get_optimized_runner must NOT + inject gpu_mem_limit at all, leaving ORT's grow-as-needed default in place.""" + + @patch("frigate.detectors.detection_runners.ort.InferenceSession") + @patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]), + ) + @patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ) + @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_no_gpu_mem_limit_key_when_cuda_query_fails( + self, _gs, _cdll, _rknn, _gp, mock_session + ): + from frigate.detectors.detection_runners import get_optimized_runner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + mock_session.return_value.get_inputs.return_value = [] + mock_session.return_value.get_outputs.return_value = [] + + get_optimized_runner( + "/fake/jina.onnx", + device="GPU", + model_type=EnrichmentModelTypeEnum.jina_v2.value, + ) + + provider_opts = mock_session.call_args.kwargs["provider_options"] + self.assertNotIn( + "gpu_mem_limit", + provider_opts[0], + "Must omit (not set to 0, not set to a guess) when query fails", + ) + + class TestCudaGraphFallbackLogsException(unittest.TestCase): @patch("frigate.detectors.detection_runners.ort.InferenceSession") @patch( diff --git a/frigate/util/model.py b/frigate/util/model.py index 9867115a3..ac1cfe226 100644 --- a/frigate/util/model.py +++ b/frigate/util/model.py @@ -284,18 +284,33 @@ def post_process_yolox( ### ONNX Utilities -def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int: +def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int | None: """Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena. 
-    For CudaGraphRunner (YOLO detection) do NOT call this — CUDA graph capture
+    For CudaGraphRunner (YOLO detection) do NOT call this - CUDA graph capture
     requires all intermediate tensors to be live simultaneously, so peak GPU memory
-    is 15-20× the model file size and cannot be safely capped. This function is
+    is 15-20x the model file size and cannot be safely capped. This function is
     intended for embedding ONNXModelRunner sessions only.
 
     Returns a limit derived from:
-    - Floor: model file size × peak_multiplier (≥ 2 GB)
-    - Ceiling: min(80% of total VRAM, 90% of currently free VRAM)
-    Falls back to 4 GB if the CUDA runtime query fails.
+    - min(max(model file size x peak_multiplier, 2 GB), 80% of total VRAM, 90% of free VRAM)
+
+    Returns None if the CUDA runtime query fails. The caller MUST then omit
+    gpu_mem_limit from provider_options so ORT falls back to its own default
+    (grow-as-needed up to device capacity).
+
+    Tradeoff: a hardcoded fallback (e.g. 4 GB) was previously returned here,
+    but that number is wrong at both ends of the spectrum:
+    - On Jetson Nano (4 GB shared), Quadro K620 (2 GB), GT 1030 (2 GB), and
+      any container where /dev/nvidia* passthrough is broken, asking for 4 GB
+      causes ORT session init to fail with cudaErrorMemoryAllocation.
+    - On a 24 GB RTX 3090 with 20 GB free, capping at 4 GB needlessly
+      starves the session and forces extra arena reallocations.
+    Returning None and letting ORT manage the arena itself is the
+    least-surprising behavior when we cannot actually measure VRAM. The
+    leak vectors this PR addresses (mem_pattern, mallopt) are independent
+    of the BFC arena cap, so dropping the cap on the failure path does
+    not reintroduce the leak.
     """
     try:
         libcudart = ctypes.CDLL("libcudart.so")
@@ -309,14 +324,14 @@ def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:
         total = total_bytes.value
         free = free_bytes.value
     except Exception as e:
-        logger.debug("cudaMemGetInfo unavailable (%s); using 4 GB gpu_mem_limit fallback", e)
-        return 4 * 1024**3
+        logger.debug("cudaMemGetInfo unavailable (%s); omitting gpu_mem_limit", e)
+        return None
 
     peak_multiplier = 14 if cuda_graph else 7
-    floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
+    desired = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
     # Honor free VRAM so co-resident embedding sessions (jina text + vision,
     # paddleocr det + rec, arcface) don't OOM each other on shared GPUs.
-    return min(floor, int(total * 0.80), int(free * 0.90))
+    return min(desired, int(total * 0.80), int(free * 0.90))
 
 
 def get_ort_providers(
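
The whole change reduces to a probe-then-omit pattern. A minimal standalone
sketch follows; probe_vram, arena_cap_bytes, and the model path are
illustrative names, not identifiers from this patch, while the 7x multiplier,
the 2 GB floor, and the 80%/90% caps mirror compute_cuda_mem_limit above:

    import ctypes
    import os


    def probe_vram() -> tuple[int, int] | None:
        """(free_bytes, total_bytes) via cudaMemGetInfo, or None on any failure."""
        try:
            libcudart = ctypes.CDLL("libcudart.so")
        except OSError:
            return None  # runtime missing / passthrough broken: nothing to measure
        free_b = ctypes.c_size_t(0)
        total_b = ctypes.c_size_t(0)
        status = libcudart.cudaMemGetInfo(ctypes.byref(free_b), ctypes.byref(total_b))
        if status != 0:
            return None  # nonzero cudaError_t: both out-params are untrustworthy
        return free_b.value, total_b.value


    def arena_cap_bytes(model_path: str, peak_multiplier: int = 7) -> int | None:
        vram = probe_vram()
        if vram is None:
            return None  # caller must omit gpu_mem_limit, not guess
        free, total = vram
        desired = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
        return min(desired, int(total * 0.80), int(free * 0.90))


    cuda_opts: dict = {"device_id": 0}
    cap = arena_cap_bytes("/models/embedding.onnx")  # hypothetical model path
    if cap is not None:
        # Inject only when VRAM was actually measured; an absent key leaves
        # ORT's grow-as-needed arena default in place.
        cuda_opts["gpu_mem_limit"] = cap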
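
At the session boundary, the distinction that matters is absent key versus bad
value: ORT reads gpu_mem_limit when the session is constructed, so a
present-but-wrong entry (0 from uninitialized pointers, or a 4 GB guess on a
2 GB card) fails init, while an absent key selects the default arena. A sketch
of the call shape the new test asserts against, reusing the illustrative
cuda_opts from the sketch above:

    import onnxruntime as ort

    session = ort.InferenceSession(
        "/models/embedding.onnx",  # hypothetical path, as above
        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        provider_options=[cuda_opts, {}],  # one options dict per provider
    )

This is also why the regression test uses assertNotIn("gpu_mem_limit",
provider_opts[0]) rather than comparing against a sentinel such as 0.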