From 1717f21f69df6f809f0cd8ceaca871be912446e2 Mon Sep 17 00:00:00 2001 From: felalex Date: Sat, 2 May 2026 23:44:18 -0700 Subject: [PATCH] fix: handle CUDA query failure and free-VRAM contention in gpu_mem_limit Targeted follow-ups to the embeddings_manager ORT leak fix that affect all GPU-resident embedding models (Jina text+vision, PaddleOCR det+rec, ArcFace, YOLOv9 license plate). Detection-side YOLO runners are unaffected since CudaGraphRunner does not call compute_cuda_mem_limit. - compute_cuda_mem_limit now checks the cudaMemGetInfo return code instead of trusting that a non-throwing call populated the buffers. Previously a non-zero rc left both pointers at 0, producing gpu_mem_limit=0 and immediate session OOM rather than the documented 4 GB fallback. - The limit also factors in currently-free VRAM (free * 0.9), not just total. On a shared GPU where co-resident embedding sessions have already consumed most of the device, capping at 80% of total still over-allocates. - The CUDA graph fallback path now logs the underlying exception text so failures (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.) stop being swallowed by the bare except. Tests cover all three regression paths, and existing tests are updated to require cudaMemGetInfo to return cudaSuccess explicitly. 
Co-Authored-By: Claude Opus 4.7 --- frigate/detectors/detection_runners.py | 5 +- frigate/test/test_detection_runners.py | 108 ++++++++++++++++++++----- frigate/util/model.py | 18 +++-- 3 files changed, 103 insertions(+), 28 deletions(-) diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py index 277b1c542..e397d73fb 100644 --- a/frigate/detectors/detection_runners.py +++ b/frigate/detectors/detection_runners.py @@ -625,10 +625,11 @@ def get_optimized_runner( ), cuda_graph_options["device_id"], ) - except Exception: + except Exception as e: logger.warning( - "CUDA graph capture failed for %s, falling back to standard ONNX runner", + "CUDA graph capture failed for %s, falling back to standard ONNX runner: %s", model_path, + e, ) if ( diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py index c914b9bdc..e8079f1c6 100644 --- a/frigate/test/test_detection_runners.py +++ b/frigate/test/test_detection_runners.py @@ -100,52 +100,82 @@ class TestHasVariableLengthInputs(unittest.TestCase): class TestComputeCudaMemLimit(unittest.TestCase): + @staticmethod + def _fake_mem_get_info(free_value: int, total_value: int): + def _impl(free_ptr, total_ptr): + free_ptr._obj.value = free_value + total_ptr._obj.value = total_value + return 0 # cudaSuccess + + return _impl + @patch("frigate.util.model.ctypes.CDLL") - @patch("os.path.getsize", return_value=200 * 1024 * 1024) # 200 MB model - def test_respects_ceiling(self, mock_getsize, mock_cdll): - """gpu_mem_limit must not exceed 80% of total VRAM.""" + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_respects_ceiling(self, _mock_getsize, mock_cdll): from frigate.util.model import compute_cuda_mem_limit - total_vram = 24 * 1024**3 # 24 GB + total_vram = 24 * 1024**3 mock_lib = MagicMock() mock_cdll.return_value = mock_lib - - def fake_mem_get_info(free_ptr, total_ptr): - total_ptr._obj.value = total_vram - free_ptr._obj.value = total_vram 
- - mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + total_vram, total_vram + ) limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) self.assertLessEqual(limit, int(total_vram * 0.80)) @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) def test_fallback_on_cuda_unavailable(self, _mock_cdll): - """Falls back to 4 GB when CUDA runtime is not available.""" from frigate.util.model import compute_cuda_mem_limit limit = compute_cuda_mem_limit("/fake/model.onnx") self.assertEqual(limit, 4 * 1024**3) @patch("frigate.util.model.ctypes.CDLL") - @patch("os.path.getsize", return_value=50 * 1024 * 1024) # 50 MB model - def test_floor_is_at_least_2gb(self, mock_getsize, mock_cdll): - """Floor must be at least 2 GB regardless of model size.""" + @patch("os.path.getsize", return_value=50 * 1024 * 1024) + def test_floor_is_at_least_2gb(self, _mock_getsize, mock_cdll): from frigate.util.model import compute_cuda_mem_limit total_vram = 24 * 1024**3 mock_lib = MagicMock() mock_cdll.return_value = mock_lib - - def fake_mem_get_info(free_ptr, total_ptr): - total_ptr._obj.value = total_vram - free_ptr._obj.value = total_vram - - mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + total_vram, total_vram + ) limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) self.assertGreaterEqual(limit, 2 * 1024**3) + @patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_fallback_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll): + # Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0, + # producing gpu_mem_limit=0 and immediate session OOM. 
+ from frigate.util.model import compute_cuda_mem_limit + + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + mock_lib.cudaMemGetInfo.return_value = 2 # cudaErrorMemoryAllocation + + limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) + self.assertEqual(limit, 4 * 1024**3) + + @patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_capped_by_free_vram_when_constrained(self, _mock_getsize, mock_cdll): + # Bug #2: with 3 GB free of 24 GB, the limit must respect free × 0.9, + # not 80% of total — co-resident embedding sessions would OOM otherwise. + from frigate.util.model import compute_cuda_mem_limit + + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + 3 * 1024**3, 24 * 1024**3 + ) + + limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) + self.assertLessEqual(limit, int(3 * 1024**3 * 0.90)) + class TestOrtLeakFixRegression(unittest.TestCase): """Regression guards for the embeddings_manager ORT memory leak fix. @@ -311,5 +341,43 @@ class TestOrtLeakFixRegression(unittest.TestCase): ) +class TestCudaGraphFallbackLogsException(unittest.TestCase): + @patch("frigate.detectors.detection_runners.ort.InferenceSession") + @patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]), + ) + @patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ) + @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_fallback_warning_includes_exception_text( + self, _gs, _cdll, _rknn, _gp, mock_session + ): + # Concern #1: the bare `except Exception:` swallowed the underlying + # ORT error (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.), + # turning a debuggable failure into an opaque "fell back to ONNX runner". 
+ from frigate.detectors.detection_runners import get_optimized_runner + from frigate.detectors.detector_config import ModelTypeEnum + + mock_session.side_effect = [ + RuntimeError("cudaErrorStreamCaptureUnsupported"), + MagicMock(get_inputs=lambda: [], get_outputs=lambda: []), + ] + + with self.assertLogs( + "frigate.detectors.detection_runners", level="WARNING" + ) as captured: + get_optimized_runner( + "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value + ) + + joined = "\n".join(captured.output) + self.assertIn("CUDA graph capture failed", joined) + self.assertIn("cudaErrorStreamCaptureUnsupported", joined) + + if __name__ == "__main__": unittest.main() diff --git a/frigate/util/model.py b/frigate/util/model.py index d0b8721cc..9867115a3 100644 --- a/frigate/util/model.py +++ b/frigate/util/model.py @@ -294,23 +294,29 @@ def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int: Returns a limit derived from: - Floor: model file size × peak_multiplier (≥ 2 GB) - - Ceiling: 80% of total GPU VRAM + - Ceiling: min(80% of total VRAM, 90% of currently free VRAM) Falls back to 4 GB if the CUDA runtime query fails. 
""" try: libcudart = ctypes.CDLL("libcudart.so") free_bytes = ctypes.c_size_t() total_bytes = ctypes.c_size_t() - libcudart.cudaMemGetInfo(ctypes.byref(free_bytes), ctypes.byref(total_bytes)) + rc = libcudart.cudaMemGetInfo( + ctypes.byref(free_bytes), ctypes.byref(total_bytes) + ) + if rc != 0 or total_bytes.value == 0: + raise RuntimeError(f"cudaMemGetInfo rc={rc} total={total_bytes.value}") total = total_bytes.value - except Exception: - logger.debug("cudaMemGetInfo unavailable; using 4 GB gpu_mem_limit fallback") + free = free_bytes.value + except Exception as e: + logger.debug("cudaMemGetInfo unavailable (%s); using 4 GB gpu_mem_limit fallback", e) return 4 * 1024**3 peak_multiplier = 14 if cuda_graph else 7 floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3) - ceiling = int(total * 0.80) - return min(floor, ceiling) + # Honor free VRAM so co-resident embedding sessions (jina text + vision, + # paddleocr det + rec, arcface) don't OOM each other on shared GPUs. + return min(floor, int(total * 0.80), int(free * 0.90)) def get_ort_providers(