fix: handle CUDA query failure and free-VRAM contention in gpu_mem_limit

Targeted follow-ups to the embeddings_manager ORT leak fix that affect all GPU-resident embedding models (Jina text+vision, PaddleOCR det+rec, ArcFace, YOLOv9 license plate). Detection-side YOLO runners are unaffected since CudaGraphRunner does not call compute_cuda_mem_limit.

- compute_cuda_mem_limit now checks the cudaMemGetInfo return code instead of trusting that a non-throwing call populated the buffers. Previously a non-zero rc left both pointers at 0, producing gpu_mem_limit=0 and immediate session OOM rather than the documented 4 GB fallback.
- The limit also factors in currently-free VRAM (free * 0.9), not just total. On a shared GPU where co-resident embedding sessions have already consumed most of the device, capping at 80% of total still over-allocates.
- The CUDA graph fallback path now logs the underlying exception text so failures (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.) stop being swallowed by the bare except.

Tests cover all three regression paths, plus updates to the existing tests, which now require cudaMemGetInfo to return cudaSuccess explicitly.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-07-03 02:21:13 +03:00 · 2026-05-02 23:44:18 -07:00 · 2026-05-02 23:44:18 -07:00 · 1717f21f69
commit 1717f21f69
parent 71060805f0
3 changed files with 103 additions and 28 deletions
--- a/frigate/detectors/detection_runners.py
+++ b/frigate/detectors/detection_runners.py
@@ -625,10 +625,11 @@ def get_optimized_runner(
                ),
                cuda_graph_options["device_id"],
            )
-        except Exception:
+        except Exception as e:
            logger.warning(
-                "CUDA graph capture failed for %s, falling back to standard ONNX runner",
+                "CUDA graph capture failed for %s, falling back to standard ONNX runner: %s",
                model_path,
                e,
            )
    if (
--- a/frigate/test/test_detection_runners.py
+++ b/frigate/test/test_detection_runners.py
@@ -100,52 +100,82 @@ class TestHasVariableLengthInputs(unittest.TestCase):
 class TestComputeCudaMemLimit(unittest.TestCase):
    @staticmethod
    def _fake_mem_get_info(free_value: int, total_value: int):
        def _impl(free_ptr, total_ptr):
            free_ptr._obj.value = free_value
            total_ptr._obj.value = total_value
            return 0  # cudaSuccess
        return _impl
    @patch("frigate.util.model.ctypes.CDLL")
-    @patch("os.path.getsize", return_value=200 * 1024 * 1024)  # 200 MB model
+    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
-    def test_respects_ceiling(self, mock_getsize, mock_cdll):
+    def test_respects_ceiling(self, _mock_getsize, mock_cdll):
        """gpu_mem_limit must not exceed 80% of total VRAM."""
        from frigate.util.model import compute_cuda_mem_limit
-        total_vram = 24 * 1024**3  # 24 GB
+        total_vram = 24 * 1024**3
        mock_lib = MagicMock()
        mock_cdll.return_value = mock_lib
-
+        mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
-        def fake_mem_get_info(free_ptr, total_ptr):
+            total_vram, total_vram
-            total_ptr._obj.value = total_vram
+        )
            free_ptr._obj.value = total_vram
        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
        self.assertLessEqual(limit, int(total_vram * 0.80))
    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
    def test_fallback_on_cuda_unavailable(self, _mock_cdll):
        """Falls back to 4 GB when CUDA runtime is not available."""
        from frigate.util.model import compute_cuda_mem_limit
        limit = compute_cuda_mem_limit("/fake/model.onnx")
        self.assertEqual(limit, 4 * 1024**3)
    @patch("frigate.util.model.ctypes.CDLL")
-    @patch("os.path.getsize", return_value=50 * 1024 * 1024)  # 50 MB model
+    @patch("os.path.getsize", return_value=50 * 1024 * 1024)
-    def test_floor_is_at_least_2gb(self, mock_getsize, mock_cdll):
+    def test_floor_is_at_least_2gb(self, _mock_getsize, mock_cdll):
        """Floor must be at least 2 GB regardless of model size."""
        from frigate.util.model import compute_cuda_mem_limit
        total_vram = 24 * 1024**3
        mock_lib = MagicMock()
        mock_cdll.return_value = mock_lib
-
+        mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
-        def fake_mem_get_info(free_ptr, total_ptr):
+            total_vram, total_vram
-            total_ptr._obj.value = total_vram
+        )
            free_ptr._obj.value = total_vram
        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
        self.assertGreaterEqual(limit, 2 * 1024**3)
    @patch("frigate.util.model.ctypes.CDLL")
    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
    def test_fallback_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll):
        # Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0,
        # producing gpu_mem_limit=0 and immediate session OOM.
        from frigate.util.model import compute_cuda_mem_limit
        mock_lib = MagicMock()
        mock_cdll.return_value = mock_lib
        mock_lib.cudaMemGetInfo.return_value = 2  # cudaErrorMemoryAllocation
        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
        self.assertEqual(limit, 4 * 1024**3)
    @patch("frigate.util.model.ctypes.CDLL")
    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
    def test_capped_by_free_vram_when_constrained(self, _mock_getsize, mock_cdll):
        # Bug #2: with 3 GB free of 24 GB, the limit must respect free × 0.9,
        # not 80% of total — co-resident embedding sessions would OOM otherwise.
        from frigate.util.model import compute_cuda_mem_limit
        mock_lib = MagicMock()
        mock_cdll.return_value = mock_lib
        mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
            3 * 1024**3, 24 * 1024**3
        )
        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
        self.assertLessEqual(limit, int(3 * 1024**3 * 0.90))
 class TestOrtLeakFixRegression(unittest.TestCase):
    """Regression guards for the embeddings_manager ORT memory leak fix.
@@ -311,5 +341,43 @@ class TestOrtLeakFixRegression(unittest.TestCase):
            )
 class TestCudaGraphFallbackLogsException(unittest.TestCase):
    @patch("frigate.detectors.detection_runners.ort.InferenceSession")
    @patch(
        "frigate.detectors.detection_runners.get_ort_providers",
        return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
    )
    @patch(
        "frigate.detectors.detection_runners.is_rknn_compatible",
        return_value=False,
    )
    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
    def test_fallback_warning_includes_exception_text(
        self, _gs, _cdll, _rknn, _gp, mock_session
    ):
        # Concern #1: the bare `except Exception:` swallowed the underlying
        # ORT error (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.),
        # turning a debuggable failure into an opaque "fell back to ONNX runner".
        from frigate.detectors.detection_runners import get_optimized_runner
        from frigate.detectors.detector_config import ModelTypeEnum
        mock_session.side_effect = [
            RuntimeError("cudaErrorStreamCaptureUnsupported"),
            MagicMock(get_inputs=lambda: [], get_outputs=lambda: []),
        ]
        with self.assertLogs(
            "frigate.detectors.detection_runners", level="WARNING"
        ) as captured:
            get_optimized_runner(
                "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value
            )
        joined = "\n".join(captured.output)
        self.assertIn("CUDA graph capture failed", joined)
        self.assertIn("cudaErrorStreamCaptureUnsupported", joined)
 if __name__ == "__main__":
    unittest.main()
--- a/frigate/util/model.py
+++ b/frigate/util/model.py
@@ -294,23 +294,29 @@ def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:
    Returns a limit derived from:
    - Floor: model file size × peak_multiplier (≥ 2 GB)
-    - Ceiling: 80% of total GPU VRAM
+    - Ceiling: min(80% of total VRAM, 90% of currently free VRAM)
    Falls back to 4 GB if the CUDA runtime query fails.
    """
    try:
        libcudart = ctypes.CDLL("libcudart.so")
        free_bytes = ctypes.c_size_t()
        total_bytes = ctypes.c_size_t()
-        libcudart.cudaMemGetInfo(ctypes.byref(free_bytes), ctypes.byref(total_bytes))
+        rc = libcudart.cudaMemGetInfo(
            ctypes.byref(free_bytes), ctypes.byref(total_bytes)
        )
        if rc != 0 or total_bytes.value == 0:
            raise RuntimeError(f"cudaMemGetInfo rc={rc} total={total_bytes.value}")
        total = total_bytes.value
-    except Exception:
+        free = free_bytes.value
-        logger.debug("cudaMemGetInfo unavailable; using 4 GB gpu_mem_limit fallback")
+    except Exception as e:
        logger.debug("cudaMemGetInfo unavailable (%s); using 4 GB gpu_mem_limit fallback", e)
        return 4 * 1024**3
    peak_multiplier = 14 if cuda_graph else 7
    floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
-    ceiling = int(total * 0.80)
+    # Honor free VRAM so co-resident embedding sessions (jina text + vision,
-    return min(floor, ceiling)
+    # paddleocr det + rec, arcface) don't OOM each other on shared GPUs.
    return min(floor, int(total * 0.80), int(free * 0.90))
 def get_ort_providers(