From 351bef936f9fbdd444cb56af6722891f7024028f Mon Sep 17 00:00:00 2001 From: felalex Date: Sat, 2 May 2026 23:58:06 -0700 Subject: [PATCH] fix: omit gpu_mem_limit on CUDA query failure instead of guessing 4 GB When cudaMemGetInfo cannot be called or returns an error, compute_cuda_mem_limit now returns None and the caller skips injecting gpu_mem_limit, leaving ORT to manage its own arena (grow-as-needed up to device capacity). Tradeoff documented in the docstring: - Old behavior returned a hardcoded 4 GB. That was wrong for low-VRAM devices (Jetson Nano 4 GB shared, Quadro K620 2 GB, GT 1030 2 GB) and broken /dev/nvidia* container passthroughs, where requesting 4 GB caused cudaErrorMemoryAllocation at session init. It was also wrong for big GPUs (24 GB RTX 3090 with 20 GB free), needlessly starving the session. - The leak vectors fixed elsewhere in this PR (mem_pattern + mallopt) are independent of the BFC arena cap, so dropping the cap on the failure path does not reintroduce the leak. Co-Authored-By: Claude Opus 4.7 --- frigate/detectors/detection_runners.py | 13 ++++--- frigate/test/test_detection_runners.py | 53 ++++++++++++++++++++++---- frigate/util/model.py | 35 ++++++++++++----- 3 files changed, 79 insertions(+), 22 deletions(-) diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py index 922c7748e..09f88a116 100644 --- a/frigate/detectors/detection_runners.py +++ b/frigate/detectors/detection_runners.py @@ -636,8 +636,12 @@ def get_optimized_runner( ) except Exception as e: logger.warning( - "CUDA graph capture failed for %s, falling back to standard ONNX runner: %s", + "CUDA graph capture failed for model_type=%s path=%s " + "device_id=%s providers=%s; falling back to standard ONNX runner: %s", + model_type, model_path, + cuda_graph_options.get("device_id"), + providers, e, ) @@ -651,10 +655,9 @@ def get_optimized_runner( options.pop(0) if providers and providers[0] == "CUDAExecutionProvider": - options[0] = { - **options[0], - "gpu_mem_limit": compute_cuda_mem_limit(model_path, cuda_graph=False), - } + gpu_mem_limit = compute_cuda_mem_limit(model_path, cuda_graph=False) + if gpu_mem_limit is not None: + options[0] = {**options[0], "gpu_mem_limit": gpu_mem_limit} return ONNXModelRunner( ort.InferenceSession( diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py index e8079f1c6..4f11d7afb 100644 --- a/frigate/test/test_detection_runners.py +++ b/frigate/test/test_detection_runners.py @@ -125,11 +125,12 @@ class TestComputeCudaMemLimit(unittest.TestCase): self.assertLessEqual(limit, int(total_vram * 0.80)) @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) - def test_fallback_on_cuda_unavailable(self, _mock_cdll): + def test_returns_none_when_cuda_unavailable(self, _mock_cdll): + # See compute_cuda_mem_limit docstring for the tradeoff: returning a + # hardcoded fallback was wrong for low-VRAM devices (Jetson Nano, K620). 
from frigate.util.model import compute_cuda_mem_limit - limit = compute_cuda_mem_limit("/fake/model.onnx") - self.assertEqual(limit, 4 * 1024**3) + self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx")) @patch("frigate.util.model.ctypes.CDLL") @patch("os.path.getsize", return_value=50 * 1024 * 1024) @@ -148,17 +149,17 @@ class TestComputeCudaMemLimit(unittest.TestCase): @patch("frigate.util.model.ctypes.CDLL") @patch("os.path.getsize", return_value=200 * 1024 * 1024) - def test_fallback_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll): + def test_returns_none_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll): # Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0, - # producing gpu_mem_limit=0 and immediate session OOM. + # producing gpu_mem_limit=0 and immediate session OOM. We now return + # None so the caller omits gpu_mem_limit and ORT manages the arena. from frigate.util.model import compute_cuda_mem_limit mock_lib = MagicMock() mock_cdll.return_value = mock_lib mock_lib.cudaMemGetInfo.return_value = 2 # cudaErrorMemoryAllocation - limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) - self.assertEqual(limit, 4 * 1024**3) + self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)) @patch("frigate.util.model.ctypes.CDLL") @patch("os.path.getsize", return_value=200 * 1024 * 1024) @@ -341,6 +342,44 @@ class TestOrtLeakFixRegression(unittest.TestCase): ) +class TestRunnerOmitsGpuMemLimitOnCudaQueryFailure(unittest.TestCase): + """When compute_cuda_mem_limit returns None, get_optimized_runner must NOT + inject gpu_mem_limit at all, leaving ORT's grow-as-needed default in place.""" + + @patch("frigate.detectors.detection_runners.ort.InferenceSession") + @patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]), + ) + @patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ) + @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_no_gpu_mem_limit_key_when_cuda_query_fails( + self, _gs, _cdll, _rknn, _gp, mock_session + ): + from frigate.detectors.detection_runners import get_optimized_runner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + mock_session.return_value.get_inputs.return_value = [] + mock_session.return_value.get_outputs.return_value = [] + + get_optimized_runner( + "/fake/jina.onnx", + device="GPU", + model_type=EnrichmentModelTypeEnum.jina_v2.value, + ) + + provider_opts = mock_session.call_args.kwargs["provider_options"] + self.assertNotIn( + "gpu_mem_limit", + provider_opts[0], + "Must omit (not set to 0, not set to a guess) when query fails", + ) + + class TestCudaGraphFallbackLogsException(unittest.TestCase): @patch("frigate.detectors.detection_runners.ort.InferenceSession") @patch( diff --git a/frigate/util/model.py b/frigate/util/model.py index 9867115a3..ac1cfe226 100644 --- a/frigate/util/model.py +++ b/frigate/util/model.py @@ -284,18 +284,33 @@ def post_process_yolox( ### ONNX Utilities -def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int: +def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int | None: """Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena. 
-    For CudaGraphRunner (YOLO detection) do NOT call this — CUDA graph capture
+    For CudaGraphRunner (YOLO detection) do NOT call this - CUDA graph capture
     requires all intermediate tensors to be live simultaneously, so peak GPU memory
-    is 15-20× the model file size and cannot be safely capped. This function is
+    is 15-20x the model file size and cannot be safely capped. This function is
     intended for embedding ONNXModelRunner sessions only.
 
     Returns a limit derived from:
-    - Floor: model file size × peak_multiplier (≥ 2 GB)
-    - Ceiling: min(80% of total VRAM, 90% of currently free VRAM)
-    Falls back to 4 GB if the CUDA runtime query fails.
+    - min(max(model file size x peak_multiplier, 2 GB), 80% of total VRAM, 90% of free VRAM)
+
+    Returns None if the CUDA runtime query fails. The caller MUST then omit
+    gpu_mem_limit from provider_options so ORT falls back to its own default
+    (grow-as-needed up to device capacity).
+
+    Tradeoff: a hardcoded fallback (e.g. 4 GB) was previously returned here,
+    but that number is wrong at both ends of the spectrum:
+    - On Jetson Nano (4 GB shared), Quadro K620 (2 GB), GT 1030 (2 GB), and
+      any container where /dev/nvidia* passthrough is broken, asking for 4 GB
+      causes ORT session init to fail with cudaErrorMemoryAllocation.
+    - On a 24 GB RTX 3090 with 20 GB free, capping at 4 GB needlessly
+      starves the session and forces extra arena reallocations.
+    Returning None and letting ORT manage the arena itself is the
+    least-surprising behavior when we cannot actually measure VRAM. The
+    leak vectors this PR addresses (mem_pattern, mallopt) are independent
+    of the BFC arena cap, so dropping the cap on the failure path does
+    not reintroduce the leak.
     """
     try:
         libcudart = ctypes.CDLL("libcudart.so")
@@ -309,14 +324,14 @@ def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:
         total = total_bytes.value
         free = free_bytes.value
     except Exception as e:
-        logger.debug("cudaMemGetInfo unavailable (%s); using 4 GB gpu_mem_limit fallback", e)
-        return 4 * 1024**3
+        logger.debug("cudaMemGetInfo unavailable (%s); omitting gpu_mem_limit", e)
+        return None
 
     peak_multiplier = 14 if cuda_graph else 7
-    floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
+    desired = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
     # Honor free VRAM so co-resident embedding sessions (jina text + vision,
     # paddleocr det + rec, arcface) don't OOM each other on shared GPUs.
-    return min(floor, int(total * 0.80), int(free * 0.90))
+    return min(desired, int(total * 0.80), int(free * 0.90))
 
 
 def get_ort_providers(
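
The whole change reduces to a probe-then-omit pattern. A minimal standalone
sketch follows; probe_vram, arena_cap_bytes, and the model path are
illustrative names, not identifiers from this patch, while the 7x multiplier,
the 2 GB floor, and the 80%/90% caps mirror compute_cuda_mem_limit above:

    import ctypes
    import os


    def probe_vram() -> tuple[int, int] | None:
        """(free_bytes, total_bytes) via cudaMemGetInfo, or None on any failure."""
        try:
            libcudart = ctypes.CDLL("libcudart.so")
        except OSError:
            return None  # runtime missing / passthrough broken: nothing to measure
        free_b = ctypes.c_size_t(0)
        total_b = ctypes.c_size_t(0)
        status = libcudart.cudaMemGetInfo(ctypes.byref(free_b), ctypes.byref(total_b))
        if status != 0:
            return None  # nonzero cudaError_t: both out-params are untrustworthy
        return free_b.value, total_b.value


    def arena_cap_bytes(model_path: str, peak_multiplier: int = 7) -> int | None:
        vram = probe_vram()
        if vram is None:
            return None  # caller must omit gpu_mem_limit, not guess
        free, total = vram
        desired = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
        return min(desired, int(total * 0.80), int(free * 0.90))


    cuda_opts: dict = {"device_id": 0}
    cap = arena_cap_bytes("/models/embedding.onnx")  # hypothetical model path
    if cap is not None:
        # Inject only when VRAM was actually measured; an absent key leaves
        # ORT's grow-as-needed arena default in place.
        cuda_opts["gpu_mem_limit"] = cap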
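
At the session boundary, the distinction that matters is absent key versus bad
value: ORT reads gpu_mem_limit when the session is constructed, so a
present-but-wrong entry (0 from uninitialized pointers, or a 4 GB guess on a
2 GB card) fails init, while an absent key selects the default arena. A sketch
of the call shape the new test asserts against, reusing the illustrative
cuda_opts from the sketch above:

    import onnxruntime as ort

    session = ort.InferenceSession(
        "/models/embedding.onnx",  # hypothetical path, as above
        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        provider_options=[cuda_opts, {}],  # one options dict per provider
    )

This is also why the regression test uses assertNotIn("gpu_mem_limit",
provider_opts[0]) rather than comparing against a sentinel such as 0.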