From 71060805f05d53c5b0ef885dd22269c574699719 Mon Sep 17 00:00:00 2001 From: felalex Date: Sat, 2 May 2026 07:57:35 -0700 Subject: [PATCH 1/8] fix: prevent embeddings_manager ORT memory leak (arena + mmap plan + glibc) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent ORT/glibc leak vectors identified and fixed: 1. **ORT CPU BFC arena** (`enable_cpu_mem_arena=False` for all sessions) ORT's default CPU arena pools host-side GPU↔CPU staging buffers indefinitely. Disabling it across every InferenceSession (detection + embedding) stops hundreds-of-MB/h RSS growth seen on systems with CUDA EP sessions. 2. **ORT memory-pattern cache** (`enable_mem_pattern=False` for variable-length models) For embedding models with variable-length inputs (Jina v1/v2, PaddleOCR), ORT allocates one mmap-backed execution plan per unique sequence length and never frees them. Disabling the pattern cache stops this unbounded anon-mmap growth. Fixed-size models (YOLO) keep `enable_mem_pattern=True` to preserve buffer aliasing and avoid CUDA graph capture failures. 3. **mallopt(M_ARENA_MAX)** called from `EmbeddingProcess.run()` The forkserver start method exec()s a fresh Python interpreter that does not inherit Docker env vars, so `MALLOC_ARENA_MAX` set in docker-compose never reaches the child. Calling `mallopt(-8, os.cpu_count())` from `run()` caps glibc malloc arenas in the child process. Additional improvements: - `compute_cuda_mem_limit()`: dynamically caps the ORT CUDA EP BFC arena for embedding sessions to min(model_size × 7, 80% VRAM); prevents OOM on multi-model systems while leaving headroom for detection sessions. - CUDA graph capture is now wrapped in try/except so models with CPU-only ops (e.g. attention, NMS) fall back to ONNXModelRunner instead of crashing. - `ONNXModelRunner.has_variable_length_inputs()`: centralises the Jina/PaddleOCR detection logic to keep SessionOptions creation consistent. - 17 regression-guard unit tests in `frigate/test/test_detection_runners.py` that will fail if any of these three fixes is accidentally reverted. Fixes: #23007 Co-Authored-By: Claude Sonnet 4.6 --- frigate/detectors/detection_runners.py | 83 +++++-- frigate/embeddings/__init__.py | 11 + frigate/test/test_detection_runners.py | 315 +++++++++++++++++++++++++ frigate/util/model.py | 30 +++ 4 files changed, 420 insertions(+), 19 deletions(-) create mode 100644 frigate/test/test_detection_runners.py diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py index d12c8b733..277b1c542 100644 --- a/frigate/detectors/detection_runners.py +++ b/frigate/detectors/detection_runners.py @@ -10,7 +10,7 @@ from typing import Any import numpy as np import onnxruntime as ort -from frigate.util.model import get_ort_providers +from frigate.util.model import compute_cuda_mem_limit, get_ort_providers from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible logger = logging.getLogger(__name__) @@ -24,23 +24,36 @@ def is_arm64_platform() -> bool: def get_ort_session_options( is_complex_model: bool = False, -) -> ort.SessionOptions | None: + variable_length_inputs: bool = False, +) -> ort.SessionOptions: """Get ONNX Runtime session options with appropriate settings. Args: is_complex_model: Whether the model needs basic optimization to avoid graph fusion issues. + variable_length_inputs: Whether the model receives variable-length inputs (e.g. text + embeddings). 
When True, disables memory-pattern caching, which otherwise builds + a plan per unique input shape and holds onto mmap regions indefinitely — a major + source of RSS growth in the embeddings_manager process. Returns: - SessionOptions with appropriate optimization level, or None for default settings. + SessionOptions with appropriate settings. """ + sess_options = ort.SessionOptions() + # Disable the CPU BFC arena for all sessions. With the arena enabled ORT pools + # host-side staging buffers for GPU↔CPU transfers and never releases them back to + # the OS, causing RSS to grow without bound in long-running embedding processes. + sess_options.enable_cpu_mem_arena = False + if variable_length_inputs: + # Disable per-shape memory-layout plan caching for models with variable-length + # inputs (Jina CLIP text, PaddleOCR). Each unique sequence length creates a + # new mmap-backed plan that is never freed, leading to unbounded anon-mmap growth. + # Fixed-size models (YOLO at 640×640) should keep this enabled for buffer aliasing. + sess_options.enable_mem_pattern = False if is_complex_model: - sess_options = ort.SessionOptions() sess_options.graph_optimization_level = ( ort.GraphOptimizationLevel.ORT_ENABLE_BASIC ) - return sess_options - - return None + return sess_options # Import OpenVINO only when needed to avoid circular dependencies @@ -137,6 +150,25 @@ class ONNXModelRunner(BaseModelRunner): ModelTypeEnum.dfine.value, ] + @staticmethod + def has_variable_length_inputs(model_type: str | None) -> bool: + """Return True for models whose input length varies between inferences. + + ORT builds a memory-layout plan per unique input shape and caches it + indefinitely (enable_mem_pattern). For fixed-size models (YOLO) this + is a single plan; for variable-length text embeddings it grows without + bound and must be disabled. + """ + if not model_type: + return False + from frigate.embeddings.types import EnrichmentModelTypeEnum + + return model_type in [ + EnrichmentModelTypeEnum.jina_v1.value, + EnrichmentModelTypeEnum.jina_v2.value, + EnrichmentModelTypeEnum.paddleocr.value, + ] + @staticmethod def is_concurrent_model(model_type: str | None) -> bool: """Check if model requires thread locking for concurrent inference. 
@@ -582,18 +614,22 @@ def get_optimized_runner( CudaGraphRunner.is_model_supported(model_type) and providers[0] == "CUDAExecutionProvider" ): - options[0] = { - **options[0], - "enable_cuda_graph": True, - } - return CudaGraphRunner( - ort.InferenceSession( + try: + cuda_graph_options = {**options[0], "enable_cuda_graph": True} + return CudaGraphRunner( + ort.InferenceSession( + model_path, + sess_options=get_ort_session_options(), + providers=providers, + provider_options=[cuda_graph_options, *options[1:]], + ), + cuda_graph_options["device_id"], + ) + except Exception: + logger.warning( + "CUDA graph capture failed for %s, falling back to standard ONNX runner", model_path, - providers=providers, - provider_options=options, - ), - options[0]["device_id"], - ) + ) if ( providers @@ -604,11 +640,20 @@ def get_optimized_runner( providers.pop(0) options.pop(0) + if providers and providers[0] == "CUDAExecutionProvider": + options[0] = { + **options[0], + "gpu_mem_limit": compute_cuda_mem_limit(model_path, cuda_graph=False), + } + return ONNXModelRunner( ort.InferenceSession( model_path, sess_options=get_ort_session_options( - ONNXModelRunner.is_cpu_complex_model(model_type) + is_complex_model=ONNXModelRunner.is_cpu_complex_model(model_type), + variable_length_inputs=ONNXModelRunner.has_variable_length_inputs( + model_type + ), ), providers=providers, provider_options=options, diff --git a/frigate/embeddings/__init__.py b/frigate/embeddings/__init__.py index 7e54d9703..610f03596 100644 --- a/frigate/embeddings/__init__.py +++ b/frigate/embeddings/__init__.py @@ -1,6 +1,7 @@ """SQLite-vec embeddings database.""" import base64 +import ctypes import json import logging import os @@ -46,6 +47,16 @@ class EmbeddingProcess(FrigateProcess): self.metrics = metrics def run(self) -> None: + # Forkserver spawn exec's a fresh Python interpreter that does not + # inherit Docker env vars, so MALLOC_ARENA_MAX set in docker-compose + # never reaches this process. Set it here via mallopt so glibc caps + # the number of malloc arenas to N_CPU instead of the default 8×N_CPU, + # preventing heap fragmentation under the embeddings workload. 
+ try: + ctypes.CDLL("libc.so.6").mallopt(-8, os.cpu_count()) # M_ARENA_MAX + except Exception: + pass + self.pre_run_setup(self.config.logger) maintainer = EmbeddingMaintainer( self.config, diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py new file mode 100644 index 000000000..c914b9bdc --- /dev/null +++ b/frigate/test/test_detection_runners.py @@ -0,0 +1,315 @@ +"""Tests for detection_runners session options and memory management helpers.""" + +import unittest +from unittest.mock import MagicMock, patch + + +class TestGetOrtSessionOptions(unittest.TestCase): + def setUp(self): + import onnxruntime as ort + + self.ort = ort + + def test_default_disables_cpu_mem_arena(self): + from frigate.detectors.detection_runners import get_ort_session_options + + opts = get_ort_session_options() + self.assertFalse(opts.enable_cpu_mem_arena) + + def test_default_keeps_mem_pattern_enabled(self): + from frigate.detectors.detection_runners import get_ort_session_options + + opts = get_ort_session_options() + self.assertTrue(opts.enable_mem_pattern) + + def test_variable_length_inputs_disables_mem_pattern(self): + from frigate.detectors.detection_runners import get_ort_session_options + + opts = get_ort_session_options(variable_length_inputs=True) + self.assertFalse(opts.enable_mem_pattern) + self.assertFalse(opts.enable_cpu_mem_arena) + + def test_complex_model_sets_basic_optimization(self): + from frigate.detectors.detection_runners import get_ort_session_options + + import onnxruntime as ort + + opts = get_ort_session_options(is_complex_model=True) + self.assertEqual( + opts.graph_optimization_level, + ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, + ) + + def test_always_returns_session_options(self): + from frigate.detectors.detection_runners import get_ort_session_options + + import onnxruntime as ort + + self.assertIsInstance(get_ort_session_options(), ort.SessionOptions) + self.assertIsInstance( + get_ort_session_options(is_complex_model=True), ort.SessionOptions + ) + self.assertIsInstance( + get_ort_session_options(variable_length_inputs=True), ort.SessionOptions + ) + + +class TestHasVariableLengthInputs(unittest.TestCase): + def test_jina_v1_is_variable(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + self.assertTrue( + ONNXModelRunner.has_variable_length_inputs( + EnrichmentModelTypeEnum.jina_v1.value + ) + ) + + def test_jina_v2_is_variable(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + self.assertTrue( + ONNXModelRunner.has_variable_length_inputs( + EnrichmentModelTypeEnum.jina_v2.value + ) + ) + + def test_paddleocr_is_variable(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + self.assertTrue( + ONNXModelRunner.has_variable_length_inputs( + EnrichmentModelTypeEnum.paddleocr.value + ) + ) + + def test_yolo_generic_is_fixed(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.detectors.detector_config import ModelTypeEnum + + self.assertFalse( + ONNXModelRunner.has_variable_length_inputs(ModelTypeEnum.yologeneric.value) + ) + + def test_none_is_fixed(self): + from frigate.detectors.detection_runners import ONNXModelRunner + + self.assertFalse(ONNXModelRunner.has_variable_length_inputs(None)) + + +class TestComputeCudaMemLimit(unittest.TestCase): + 
@patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=200 * 1024 * 1024) # 200 MB model + def test_respects_ceiling(self, mock_getsize, mock_cdll): + """gpu_mem_limit must not exceed 80% of total VRAM.""" + from frigate.util.model import compute_cuda_mem_limit + + total_vram = 24 * 1024**3 # 24 GB + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + + def fake_mem_get_info(free_ptr, total_ptr): + total_ptr._obj.value = total_vram + free_ptr._obj.value = total_vram + + mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info + + limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) + self.assertLessEqual(limit, int(total_vram * 0.80)) + + @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) + def test_fallback_on_cuda_unavailable(self, _mock_cdll): + """Falls back to 4 GB when CUDA runtime is not available.""" + from frigate.util.model import compute_cuda_mem_limit + + limit = compute_cuda_mem_limit("/fake/model.onnx") + self.assertEqual(limit, 4 * 1024**3) + + @patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=50 * 1024 * 1024) # 50 MB model + def test_floor_is_at_least_2gb(self, mock_getsize, mock_cdll): + """Floor must be at least 2 GB regardless of model size.""" + from frigate.util.model import compute_cuda_mem_limit + + total_vram = 24 * 1024**3 + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + + def fake_mem_get_info(free_ptr, total_ptr): + total_ptr._obj.value = total_vram + free_ptr._obj.value = total_vram + + mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info + + limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) + self.assertGreaterEqual(limit, 2 * 1024**3) + + +class TestOrtLeakFixRegression(unittest.TestCase): + """Regression guards for the embeddings_manager ORT memory leak fix. + + These tests verify that the three leak vectors identified in GitHub Discussion + #23007 remain fixed: + + 1. ORT CPU BFC arena (enable_cpu_mem_arena) — must be False for all sessions + so host-side GPU↔CPU staging buffers are not pooled indefinitely. + + 2. ORT memory-pattern cache (enable_mem_pattern) — must be False for + variable-length embedding models (Jina, PaddleOCR) to prevent one + mmap-backed plan per unique sequence length from accumulating forever. + Must remain True for fixed-size models (YOLO) to preserve buffer aliasing. + + 3. mallopt(M_ARENA_MAX) — must be called from inside EmbeddingProcess.run() + because forkserver spawn does not inherit Docker env vars, so setting + MALLOC_ARENA_MAX in docker-compose has no effect on the child process. 
+ """ + + def test_get_optimized_runner_passes_variable_length_for_jina(self): + """get_optimized_runner must enable variable_length_inputs for Jina models.""" + from frigate.detectors.detection_runners import get_ort_session_options + from frigate.embeddings.types import EnrichmentModelTypeEnum + + with patch( + "frigate.detectors.detection_runners.get_ort_session_options", + wraps=get_ort_session_options, + ) as mock_opts, patch( + "frigate.detectors.detection_runners.ort.InferenceSession" + ), patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CPUExecutionProvider"], [{}]), + ), patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ), patch( + "os.path.getsize", return_value=100 * 1024 * 1024 + ): + from frigate.detectors.detection_runners import get_optimized_runner + + get_optimized_runner( + "/fake/jina.onnx", + device="CPU", + model_type=EnrichmentModelTypeEnum.jina_v2.value, + ) + + calls = mock_opts.call_args_list + self.assertTrue( + any(c.kwargs.get("variable_length_inputs") for c in calls), + "get_ort_session_options must be called with variable_length_inputs=True " + "for Jina models to prevent mmap plan cache growth", + ) + + def test_get_optimized_runner_does_not_set_variable_length_for_yolo(self): + """get_optimized_runner must NOT set variable_length_inputs for YOLO. + + Disabling enable_mem_pattern on YOLO (fixed 640×640 input) prevents ORT + from aliasing buffers between nodes, pushing peak GPU memory from ~1.8 GB + to >4 GB and crashing CUDA graph capture. + """ + from frigate.detectors.detection_runners import get_ort_session_options + from frigate.detectors.detector_config import ModelTypeEnum + + with patch( + "frigate.detectors.detection_runners.get_ort_session_options", + wraps=get_ort_session_options, + ) as mock_opts, patch( + "frigate.detectors.detection_runners.ort.InferenceSession" + ) as mock_session, patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CPUExecutionProvider"], [{}]), + ), patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ), patch( + "os.path.getsize", return_value=220 * 1024 * 1024 + ): + mock_session.return_value.get_inputs.return_value = [] + mock_session.return_value.get_outputs.return_value = [] + from frigate.detectors.detection_runners import get_optimized_runner + + get_optimized_runner( + "/fake/yolov9.onnx", + device="CPU", + model_type=ModelTypeEnum.yologeneric.value, + ) + + for call in mock_opts.call_args_list: + self.assertFalse( + call.kwargs.get("variable_length_inputs", False), + "variable_length_inputs must not be True for YOLO — disabling " + "enable_mem_pattern on fixed-size models causes CUDA graph crashes", + ) + + def test_all_sessions_disable_cpu_mem_arena(self): + """enable_cpu_mem_arena must be False regardless of model type. + + With the arena enabled, ORT pools CPU-side staging buffers for GPU↔CPU + transfers indefinitely, causing RSS growth of hundreds of MB per hour. 
+ """ + from frigate.detectors.detection_runners import get_ort_session_options + from frigate.embeddings.types import EnrichmentModelTypeEnum + + for model_type in [ + None, + EnrichmentModelTypeEnum.jina_v1.value, + EnrichmentModelTypeEnum.jina_v2.value, + EnrichmentModelTypeEnum.paddleocr.value, + ]: + with self.subTest(model_type=model_type): + from frigate.detectors.detection_runners import ONNXModelRunner + + opts = get_ort_session_options( + variable_length_inputs=ONNXModelRunner.has_variable_length_inputs( + model_type + ) + ) + self.assertFalse( + opts.enable_cpu_mem_arena, + f"enable_cpu_mem_arena must be False for model_type={model_type}", + ) + + def test_embedding_process_calls_mallopt(self): + """EmbeddingProcess.run() must call mallopt(M_ARENA_MAX) to cap glibc arenas. + + Forkserver spawn exec's a fresh Python interpreter that does not inherit + Docker env vars. MALLOC_ARENA_MAX set in docker-compose never reaches + the child process, so mallopt() must be called explicitly from run(). + """ + import frigate.embeddings as emb_module + + # Make EmbeddingMaintainer raise immediately so run() exits after mallopt. + with patch.object( + emb_module, "EmbeddingMaintainer", side_effect=RuntimeError("stop") + ), patch.object( + emb_module.EmbeddingProcess, "pre_run_setup" + ), patch( + "ctypes.CDLL" + ) as mock_cdll: + mock_libc = MagicMock() + mock_cdll.return_value = mock_libc + + process = emb_module.EmbeddingProcess.__new__( + emb_module.EmbeddingProcess + ) + process.config = MagicMock() + process.metrics = MagicMock() + process.stop_event = MagicMock(is_set=MagicMock(return_value=True)) + + try: + process.run() + except RuntimeError: + pass + + mock_cdll.assert_called_with("libc.so.6") + mock_libc.mallopt.assert_called_once() + args = mock_libc.mallopt.call_args[0] + self.assertEqual( + args[0], + -8, # M_ARENA_MAX + "mallopt must be called with M_ARENA_MAX (-8)", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/frigate/util/model.py b/frigate/util/model.py index 338303e2d..d0b8721cc 100644 --- a/frigate/util/model.py +++ b/frigate/util/model.py @@ -1,5 +1,6 @@ """Model Utils""" +import ctypes import logging import os from typing import Any @@ -283,6 +284,35 @@ def post_process_yolox( ### ONNX Utilities +def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int: + """Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena. + + For CudaGraphRunner (YOLO detection) do NOT call this — CUDA graph capture + requires all intermediate tensors to be live simultaneously, so peak GPU memory + is 15-20× the model file size and cannot be safely capped. This function is + intended for embedding ONNXModelRunner sessions only. + + Returns a limit derived from: + - Floor: model file size × peak_multiplier (≥ 2 GB) + - Ceiling: 80% of total GPU VRAM + Falls back to 4 GB if the CUDA runtime query fails. 
+ """ + try: + libcudart = ctypes.CDLL("libcudart.so") + free_bytes = ctypes.c_size_t() + total_bytes = ctypes.c_size_t() + libcudart.cudaMemGetInfo(ctypes.byref(free_bytes), ctypes.byref(total_bytes)) + total = total_bytes.value + except Exception: + logger.debug("cudaMemGetInfo unavailable; using 4 GB gpu_mem_limit fallback") + return 4 * 1024**3 + + peak_multiplier = 14 if cuda_graph else 7 + floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3) + ceiling = int(total * 0.80) + return min(floor, ceiling) + + def get_ort_providers( force_cpu: bool = False, device: str | None = "AUTO", From 1717f21f69df6f809f0cd8ceaca871be912446e2 Mon Sep 17 00:00:00 2001 From: felalex Date: Sat, 2 May 2026 23:44:18 -0700 Subject: [PATCH 2/8] fix: handle CUDA query failure and free-VRAM contention in gpu_mem_limit Targeted follow-ups to the embeddings_manager ORT leak fix that affect all GPU-resident embedding models (Jina text+vision, PaddleOCR det+rec, ArcFace, YOLOv9 license plate). Detection-side YOLO runners are unaffected since CudaGraphRunner does not call compute_cuda_mem_limit. - compute_cuda_mem_limit now checks the cudaMemGetInfo return code instead of trusting that a non-throwing call populated the buffers. Previously a non-zero rc left both pointers at 0, producing gpu_mem_limit=0 and immediate session OOM rather than the documented 4 GB fallback. - The limit also factors in currently-free VRAM (free * 0.9), not just total. On a shared GPU where co-resident embedding sessions have already consumed most of the device, capping at 80% of total still over-allocates. - The CUDA graph fallback path now logs the underlying exception text so failures (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.) stop being swallowed by the bare except. Tests cover all three regression paths plus updated existing tests that now require cudaMemGetInfo to return cudaSuccess explicitly. 
Co-Authored-By: Claude Opus 4.7 --- frigate/detectors/detection_runners.py | 5 +- frigate/test/test_detection_runners.py | 108 ++++++++++++++++++++----- frigate/util/model.py | 18 +++-- 3 files changed, 103 insertions(+), 28 deletions(-) diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py index 277b1c542..e397d73fb 100644 --- a/frigate/detectors/detection_runners.py +++ b/frigate/detectors/detection_runners.py @@ -625,10 +625,11 @@ def get_optimized_runner( ), cuda_graph_options["device_id"], ) - except Exception: + except Exception as e: logger.warning( - "CUDA graph capture failed for %s, falling back to standard ONNX runner", + "CUDA graph capture failed for %s, falling back to standard ONNX runner: %s", model_path, + e, ) if ( diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py index c914b9bdc..e8079f1c6 100644 --- a/frigate/test/test_detection_runners.py +++ b/frigate/test/test_detection_runners.py @@ -100,52 +100,82 @@ class TestHasVariableLengthInputs(unittest.TestCase): class TestComputeCudaMemLimit(unittest.TestCase): + @staticmethod + def _fake_mem_get_info(free_value: int, total_value: int): + def _impl(free_ptr, total_ptr): + free_ptr._obj.value = free_value + total_ptr._obj.value = total_value + return 0 # cudaSuccess + + return _impl + @patch("frigate.util.model.ctypes.CDLL") - @patch("os.path.getsize", return_value=200 * 1024 * 1024) # 200 MB model - def test_respects_ceiling(self, mock_getsize, mock_cdll): - """gpu_mem_limit must not exceed 80% of total VRAM.""" + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_respects_ceiling(self, _mock_getsize, mock_cdll): from frigate.util.model import compute_cuda_mem_limit - total_vram = 24 * 1024**3 # 24 GB + total_vram = 24 * 1024**3 mock_lib = MagicMock() mock_cdll.return_value = mock_lib - - def fake_mem_get_info(free_ptr, total_ptr): - total_ptr._obj.value = total_vram - free_ptr._obj.value = total_vram - - mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + total_vram, total_vram + ) limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) self.assertLessEqual(limit, int(total_vram * 0.80)) @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) def test_fallback_on_cuda_unavailable(self, _mock_cdll): - """Falls back to 4 GB when CUDA runtime is not available.""" from frigate.util.model import compute_cuda_mem_limit limit = compute_cuda_mem_limit("/fake/model.onnx") self.assertEqual(limit, 4 * 1024**3) @patch("frigate.util.model.ctypes.CDLL") - @patch("os.path.getsize", return_value=50 * 1024 * 1024) # 50 MB model - def test_floor_is_at_least_2gb(self, mock_getsize, mock_cdll): - """Floor must be at least 2 GB regardless of model size.""" + @patch("os.path.getsize", return_value=50 * 1024 * 1024) + def test_floor_is_at_least_2gb(self, _mock_getsize, mock_cdll): from frigate.util.model import compute_cuda_mem_limit total_vram = 24 * 1024**3 mock_lib = MagicMock() mock_cdll.return_value = mock_lib - - def fake_mem_get_info(free_ptr, total_ptr): - total_ptr._obj.value = total_vram - free_ptr._obj.value = total_vram - - mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + total_vram, total_vram + ) limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) self.assertGreaterEqual(limit, 2 * 1024**3) + @patch("frigate.util.model.ctypes.CDLL") + 
@patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_fallback_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll): + # Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0, + # producing gpu_mem_limit=0 and immediate session OOM. + from frigate.util.model import compute_cuda_mem_limit + + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + mock_lib.cudaMemGetInfo.return_value = 2 # cudaErrorMemoryAllocation + + limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) + self.assertEqual(limit, 4 * 1024**3) + + @patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_capped_by_free_vram_when_constrained(self, _mock_getsize, mock_cdll): + # Bug #2: with 3 GB free of 24 GB, the limit must respect free × 0.9, + # not 80% of total — co-resident embedding sessions would OOM otherwise. + from frigate.util.model import compute_cuda_mem_limit + + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + 3 * 1024**3, 24 * 1024**3 + ) + + limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) + self.assertLessEqual(limit, int(3 * 1024**3 * 0.90)) + class TestOrtLeakFixRegression(unittest.TestCase): """Regression guards for the embeddings_manager ORT memory leak fix. @@ -311,5 +341,43 @@ class TestOrtLeakFixRegression(unittest.TestCase): ) +class TestCudaGraphFallbackLogsException(unittest.TestCase): + @patch("frigate.detectors.detection_runners.ort.InferenceSession") + @patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]), + ) + @patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ) + @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_fallback_warning_includes_exception_text( + self, _gs, _cdll, _rknn, _gp, mock_session + ): + # Concern #1: the bare `except Exception:` swallowed the underlying + # ORT error (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.), + # turning a debuggable failure into an opaque "fell back to ONNX runner". + from frigate.detectors.detection_runners import get_optimized_runner + from frigate.detectors.detector_config import ModelTypeEnum + + mock_session.side_effect = [ + RuntimeError("cudaErrorStreamCaptureUnsupported"), + MagicMock(get_inputs=lambda: [], get_outputs=lambda: []), + ] + + with self.assertLogs( + "frigate.detectors.detection_runners", level="WARNING" + ) as captured: + get_optimized_runner( + "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value + ) + + joined = "\n".join(captured.output) + self.assertIn("CUDA graph capture failed", joined) + self.assertIn("cudaErrorStreamCaptureUnsupported", joined) + + if __name__ == "__main__": unittest.main() diff --git a/frigate/util/model.py b/frigate/util/model.py index d0b8721cc..9867115a3 100644 --- a/frigate/util/model.py +++ b/frigate/util/model.py @@ -294,23 +294,29 @@ def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int: Returns a limit derived from: - Floor: model file size × peak_multiplier (≥ 2 GB) - - Ceiling: 80% of total GPU VRAM + - Ceiling: min(80% of total VRAM, 90% of currently free VRAM) Falls back to 4 GB if the CUDA runtime query fails. 
""" try: libcudart = ctypes.CDLL("libcudart.so") free_bytes = ctypes.c_size_t() total_bytes = ctypes.c_size_t() - libcudart.cudaMemGetInfo(ctypes.byref(free_bytes), ctypes.byref(total_bytes)) + rc = libcudart.cudaMemGetInfo( + ctypes.byref(free_bytes), ctypes.byref(total_bytes) + ) + if rc != 0 or total_bytes.value == 0: + raise RuntimeError(f"cudaMemGetInfo rc={rc} total={total_bytes.value}") total = total_bytes.value - except Exception: - logger.debug("cudaMemGetInfo unavailable; using 4 GB gpu_mem_limit fallback") + free = free_bytes.value + except Exception as e: + logger.debug("cudaMemGetInfo unavailable (%s); using 4 GB gpu_mem_limit fallback", e) return 4 * 1024**3 peak_multiplier = 14 if cuda_graph else 7 floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3) - ceiling = int(total * 0.80) - return min(floor, ceiling) + # Honor free VRAM so co-resident embedding sessions (jina text + vision, + # paddleocr det + rec, arcface) don't OOM each other on shared GPUs. + return min(floor, int(total * 0.80), int(free * 0.90)) def get_ort_providers( From cf8638f260b9c6f4b3dec437a4b1eb5e7de4eeb2 Mon Sep 17 00:00:00 2001 From: felalex Date: Sat, 2 May 2026 23:51:39 -0700 Subject: [PATCH 3/8] fix: explicitly set enable_mem_pattern for fixed-size models Previously relied on ORT's implicit default (True) for fixed-size models, only flipping the flag in the variable-length branch. Set it explicitly in both branches to be robust against ORT default changes. Co-Authored-By: Claude Opus 4.7 --- frigate/detectors/detection_runners.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py index e397d73fb..6e9eeca76 100644 --- a/frigate/detectors/detection_runners.py +++ b/frigate/detectors/detection_runners.py @@ -32,7 +32,7 @@ def get_ort_session_options( is_complex_model: Whether the model needs basic optimization to avoid graph fusion issues. variable_length_inputs: Whether the model receives variable-length inputs (e.g. text embeddings). When True, disables memory-pattern caching, which otherwise builds - a plan per unique input shape and holds onto mmap regions indefinitely — a major + a plan per unique input shape and holds onto mmap regions indefinitely - a major source of RSS growth in the embeddings_manager process. Returns: @@ -40,15 +40,18 @@ def get_ort_session_options( """ sess_options = ort.SessionOptions() # Disable the CPU BFC arena for all sessions. With the arena enabled ORT pools - # host-side staging buffers for GPU↔CPU transfers and never releases them back to + # host-side staging buffers for GPU -> CPU transfers and never releases them back to # the OS, causing RSS to grow without bound in long-running embedding processes. sess_options.enable_cpu_mem_arena = False if variable_length_inputs: # Disable per-shape memory-layout plan caching for models with variable-length # inputs (Jina CLIP text, PaddleOCR). Each unique sequence length creates a # new mmap-backed plan that is never freed, leading to unbounded anon-mmap growth. - # Fixed-size models (YOLO at 640×640) should keep this enabled for buffer aliasing. sess_options.enable_mem_pattern = False + else: + # Fixed-size models (like YOLO ) keep mem_pattern on for buffer aliasing. + # Set explicitly to be robust against ORT default changes. 
+ sess_options.enable_mem_pattern = True if is_complex_model: sess_options.graph_optimization_level = ( ort.GraphOptimizationLevel.ORT_ENABLE_BASIC From 6a16fa667b7993eba26f08fdd7799b5898ced893 Mon Sep 17 00:00:00 2001 From: felalex Date: Sat, 2 May 2026 23:52:23 -0700 Subject: [PATCH 4/8] fix: explicitly set graph_optimization_level for non-complex models Same robustness pattern as the previous mem_pattern commit: previously inherited ORT's implicit default (ORT_ENABLE_ALL) when is_complex_model was False. Set explicitly in both branches so the behavior survives any future ORT default change. Co-Authored-By: Claude Opus 4.7 --- frigate/detectors/detection_runners.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py index 6e9eeca76..922c7748e 100644 --- a/frigate/detectors/detection_runners.py +++ b/frigate/detectors/detection_runners.py @@ -56,6 +56,12 @@ def get_ort_session_options( sess_options.graph_optimization_level = ( ort.GraphOptimizationLevel.ORT_ENABLE_BASIC ) + else: + # Most models tolerate aggressive fusions; set explicitly to be robust + # against ORT default changes. + sess_options.graph_optimization_level = ( + ort.GraphOptimizationLevel.ORT_ENABLE_ALL + ) return sess_options From 351bef936f9fbdd444cb56af6722891f7024028f Mon Sep 17 00:00:00 2001 From: felalex Date: Sat, 2 May 2026 23:58:06 -0700 Subject: [PATCH 5/8] fix: omit gpu_mem_limit on CUDA query failure instead of guessing 4 GB When cudaMemGetInfo cannot be called or returns an error, compute_cuda_mem_limit now returns None and the caller skips injecting gpu_mem_limit, leaving ORT to manage its own arena (grow-as-needed up to device capacity). Tradeoff documented in the docstring: - Old behavior returned a hardcoded 4 GB. That was wrong for low-VRAM devices (Jetson Nano 4 GB shared, Quadro K620 2 GB, GT 1030 2 GB) and broken /dev/nvidia* container passthroughs, where requesting 4 GB caused cudaErrorMemoryAllocation at session init. It was also wrong for big GPUs (24 GB RTX 3090 with 20 GB free), needlessly starving the session. - The leak vectors fixed elsewhere in this PR (mem_pattern + mallopt) are independent of the BFC arena cap, so dropping the cap on the failure path does not reintroduce the leak. 
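A worked example of the resulting caller behavior (numbers are
illustrative):

    # 300 MB model on a 24 GB card with 5 GB currently free:
    desired = max(300 * 2**20 * 7, 2 * 2**30)  # ~2.05 GiB
    limit = min(
        desired,
        int(24 * 2**30 * 0.80),  # 19.2 GiB ceiling from total VRAM
        int(5 * 2**30 * 0.90),   # 4.5 GiB cap from free VRAM
    )
    # -> ~2.05 GiB injected as gpu_mem_limit. If cudaMemGetInfo fails,
    # compute_cuda_mem_limit returns None and get_optimized_runner
    # omits the gpu_mem_limit key entirely.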
Co-Authored-By: Claude Opus 4.7 --- frigate/detectors/detection_runners.py | 13 ++++--- frigate/test/test_detection_runners.py | 53 ++++++++++++++++++++++---- frigate/util/model.py | 35 ++++++++++++----- 3 files changed, 79 insertions(+), 22 deletions(-) diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py index 922c7748e..09f88a116 100644 --- a/frigate/detectors/detection_runners.py +++ b/frigate/detectors/detection_runners.py @@ -636,8 +636,12 @@ def get_optimized_runner( ) except Exception as e: logger.warning( - "CUDA graph capture failed for %s, falling back to standard ONNX runner: %s", + "CUDA graph capture failed for model_type=%s path=%s " + "device_id=%s providers=%s; falling back to standard ONNX runner: %s", + model_type, model_path, + cuda_graph_options.get("device_id"), + providers, e, ) @@ -651,10 +655,9 @@ def get_optimized_runner( options.pop(0) if providers and providers[0] == "CUDAExecutionProvider": - options[0] = { - **options[0], - "gpu_mem_limit": compute_cuda_mem_limit(model_path, cuda_graph=False), - } + gpu_mem_limit = compute_cuda_mem_limit(model_path, cuda_graph=False) + if gpu_mem_limit is not None: + options[0] = {**options[0], "gpu_mem_limit": gpu_mem_limit} return ONNXModelRunner( ort.InferenceSession( diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py index e8079f1c6..4f11d7afb 100644 --- a/frigate/test/test_detection_runners.py +++ b/frigate/test/test_detection_runners.py @@ -125,11 +125,12 @@ class TestComputeCudaMemLimit(unittest.TestCase): self.assertLessEqual(limit, int(total_vram * 0.80)) @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) - def test_fallback_on_cuda_unavailable(self, _mock_cdll): + def test_returns_none_when_cuda_unavailable(self, _mock_cdll): + # See compute_cuda_mem_limit docstring for the tradeoff: returning a + # hardcoded fallback was wrong for low-VRAM devices (Jetson Nano, K620). from frigate.util.model import compute_cuda_mem_limit - limit = compute_cuda_mem_limit("/fake/model.onnx") - self.assertEqual(limit, 4 * 1024**3) + self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx")) @patch("frigate.util.model.ctypes.CDLL") @patch("os.path.getsize", return_value=50 * 1024 * 1024) @@ -148,17 +149,17 @@ class TestComputeCudaMemLimit(unittest.TestCase): @patch("frigate.util.model.ctypes.CDLL") @patch("os.path.getsize", return_value=200 * 1024 * 1024) - def test_fallback_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll): + def test_returns_none_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll): # Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0, - # producing gpu_mem_limit=0 and immediate session OOM. + # producing gpu_mem_limit=0 and immediate session OOM. We now return + # None so the caller omits gpu_mem_limit and ORT manages the arena. 
from frigate.util.model import compute_cuda_mem_limit mock_lib = MagicMock() mock_cdll.return_value = mock_lib mock_lib.cudaMemGetInfo.return_value = 2 # cudaErrorMemoryAllocation - limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) - self.assertEqual(limit, 4 * 1024**3) + self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)) @patch("frigate.util.model.ctypes.CDLL") @patch("os.path.getsize", return_value=200 * 1024 * 1024) @@ -341,6 +342,44 @@ class TestOrtLeakFixRegression(unittest.TestCase): ) +class TestRunnerOmitsGpuMemLimitOnCudaQueryFailure(unittest.TestCase): + """When compute_cuda_mem_limit returns None, get_optimized_runner must NOT + inject gpu_mem_limit at all, leaving ORT's grow-as-needed default in place.""" + + @patch("frigate.detectors.detection_runners.ort.InferenceSession") + @patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]), + ) + @patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ) + @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_no_gpu_mem_limit_key_when_cuda_query_fails( + self, _gs, _cdll, _rknn, _gp, mock_session + ): + from frigate.detectors.detection_runners import get_optimized_runner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + mock_session.return_value.get_inputs.return_value = [] + mock_session.return_value.get_outputs.return_value = [] + + get_optimized_runner( + "/fake/jina.onnx", + device="GPU", + model_type=EnrichmentModelTypeEnum.jina_v2.value, + ) + + provider_opts = mock_session.call_args.kwargs["provider_options"] + self.assertNotIn( + "gpu_mem_limit", + provider_opts[0], + "Must omit (not set to 0, not set to a guess) when query fails", + ) + + class TestCudaGraphFallbackLogsException(unittest.TestCase): @patch("frigate.detectors.detection_runners.ort.InferenceSession") @patch( diff --git a/frigate/util/model.py b/frigate/util/model.py index 9867115a3..ac1cfe226 100644 --- a/frigate/util/model.py +++ b/frigate/util/model.py @@ -284,18 +284,33 @@ def post_process_yolox( ### ONNX Utilities -def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int: +def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int | None: """Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena. - For CudaGraphRunner (YOLO detection) do NOT call this — CUDA graph capture + For CudaGraphRunner (YOLO detection) do NOT call this - CUDA graph capture requires all intermediate tensors to be live simultaneously, so peak GPU memory - is 15-20× the model file size and cannot be safely capped. This function is + is 15-20x the model file size and cannot be safely capped. This function is intended for embedding ONNXModelRunner sessions only. Returns a limit derived from: - - Floor: model file size × peak_multiplier (≥ 2 GB) - - Ceiling: min(80% of total VRAM, 90% of currently free VRAM) - Falls back to 4 GB if the CUDA runtime query fails. + - min(model file size x peak_multiplier, 80% of total VRAM, 90% of free VRAM) + + Returns None if the CUDA runtime query fails. The caller MUST then omit + gpu_mem_limit from provider_options so ORT falls back to its own default + (grow-as-needed up to device capacity). + + Tradeoff: a hardcoded fallback (e.g. 
4 GB) was previously returned here, + but that number is wrong for both ends of the spectrum: + - On Jetson Nano (4 GB shared), Quadro K620 (2 GB), GT 1030 (2 GB), and + any container where /dev/nvidia* passthrough is broken, asking for 4 GB + causes ORT session init to fail with cudaErrorMemoryAllocation. + - On a 24 GB RTX 3090 with 20 GB free, capping at 4 GB needlessly + starves the session and forces extra arena reallocations. + Returning None and letting ORT manage the arena itself is the + least-surprising behavior when we cannot actually measure VRAM. The + leak vectors this PR addresses (mem_pattern, mallopt) are independent + of the BFC arena cap, so dropping the cap on the failure path does + not reintroduce the leak. """ try: libcudart = ctypes.CDLL("libcudart.so") @@ -309,14 +324,14 @@ def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int: total = total_bytes.value free = free_bytes.value except Exception as e: - logger.debug("cudaMemGetInfo unavailable (%s); using 4 GB gpu_mem_limit fallback", e) - return 4 * 1024**3 + logger.debug("cudaMemGetInfo unavailable (%s); omitting gpu_mem_limit", e) + return None peak_multiplier = 14 if cuda_graph else 7 - floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3) + desired = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3) # Honor free VRAM so co-resident embedding sessions (jina text + vision, # paddleocr det + rec, arcface) don't OOM each other on shared GPUs. - return min(floor, int(total * 0.80), int(free * 0.90)) + return min(desired, int(total * 0.80), int(free * 0.90)) def get_ort_providers( From 3c6937c71fce7bcb4dd8ff35ef4258ce727cb885 Mon Sep 17 00:00:00 2001 From: felalex Date: Sun, 3 May 2026 21:46:58 -0700 Subject: [PATCH 6/8] test: close coverage gaps in ORT leak fix regression suite Audit of test_detection_runners.py against the actual fix changes surfaced five cases where the test suite would silently pass after a regression. Adds: - test_fallback_warning_includes_developer_context: guards the enriched CUDA-graph fallback warning fields (model_type, path, device_id, providers) against revert to the bare form. - test_default_sets_enable_all_optimization: guards the explicit else branch that pins graph_optimization_level=ORT_ENABLE_ALL, added to be robust against ORT default changes. - test_gpu_mem_limit_key_present_when_cuda_query_succeeds: positive counterpart to the existing omit-on-failure test. - test_cuda_graph_doubles_peak_multiplier: locks in the 7 -> 14 multiplier relationship in compute_cuda_mem_limit. - test_arcface_is_fixed / test_facenet_is_fixed / test_yolov9_license_plate_is_fixed: explicit fixed-size guards for the remaining enrichment models the fix targets. 28 tests pass (was 21). Co-Authored-By: Claude Haiku 4.5 --- frigate/test/test_detection_runners.py | 152 +++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py index 4f11d7afb..2c593f7a0 100644 --- a/frigate/test/test_detection_runners.py +++ b/frigate/test/test_detection_runners.py @@ -40,6 +40,19 @@ class TestGetOrtSessionOptions(unittest.TestCase): ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, ) + def test_default_sets_enable_all_optimization(self): + # Guards the explicit `else` branch added so the optimization level is + # never implicit — protects against ORT default changes. 
+ from frigate.detectors.detection_runners import get_ort_session_options + + import onnxruntime as ort + + opts = get_ort_session_options() + self.assertEqual( + opts.graph_optimization_level, + ort.GraphOptimizationLevel.ORT_ENABLE_ALL, + ) + def test_always_returns_session_options(self): from frigate.detectors.detection_runners import get_ort_session_options @@ -98,6 +111,36 @@ class TestHasVariableLengthInputs(unittest.TestCase): self.assertFalse(ONNXModelRunner.has_variable_length_inputs(None)) + def test_arcface_is_fixed(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + self.assertFalse( + ONNXModelRunner.has_variable_length_inputs( + EnrichmentModelTypeEnum.arcface.value + ) + ) + + def test_facenet_is_fixed(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + self.assertFalse( + ONNXModelRunner.has_variable_length_inputs( + EnrichmentModelTypeEnum.facenet.value + ) + ) + + def test_yolov9_license_plate_is_fixed(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + self.assertFalse( + ONNXModelRunner.has_variable_length_inputs( + EnrichmentModelTypeEnum.yolov9_license_plate.value + ) + ) + class TestComputeCudaMemLimit(unittest.TestCase): @staticmethod @@ -161,6 +204,27 @@ class TestComputeCudaMemLimit(unittest.TestCase): self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)) + @patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=500 * 1024 * 1024) + def test_cuda_graph_doubles_peak_multiplier(self, _mock_getsize, mock_cdll): + # cuda_graph=True must use peak_multiplier=14 (vs 7 for cuda_graph=False) + # because graph capture pins all intermediate tensors live simultaneously. 
+ from frigate.util.model import compute_cuda_mem_limit + + total_vram = 24 * 1024**3 + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + total_vram, total_vram + ) + + model_size = 500 * 1024 * 1024 + with_graph = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=True) + without_graph = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) + self.assertGreaterEqual(with_graph, model_size * 14) + self.assertGreaterEqual(without_graph, model_size * 7) + self.assertGreater(with_graph, without_graph) + @patch("frigate.util.model.ctypes.CDLL") @patch("os.path.getsize", return_value=200 * 1024 * 1024) def test_capped_by_free_vram_when_constrained(self, _mock_getsize, mock_cdll): @@ -380,6 +444,57 @@ class TestRunnerOmitsGpuMemLimitOnCudaQueryFailure(unittest.TestCase): ) +class TestRunnerInjectsGpuMemLimitOnCudaQuerySuccess(unittest.TestCase): + """Positive counterpart to TestRunnerOmitsGpuMemLimitOnCudaQueryFailure: + when cudaMemGetInfo succeeds, gpu_mem_limit must be injected into + provider_options so ORT's BFC arena is bounded.""" + + @staticmethod + def _fake_mem_get_info(free_value: int, total_value: int): + def _impl(free_ptr, total_ptr): + free_ptr._obj.value = free_value + total_ptr._obj.value = total_value + return 0 # cudaSuccess + + return _impl + + @patch("frigate.detectors.detection_runners.ort.InferenceSession") + @patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]), + ) + @patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ) + @patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_gpu_mem_limit_key_present_when_cuda_query_succeeds( + self, _gs, mock_cdll, _rknn, _gp, mock_session + ): + from frigate.detectors.detection_runners import get_optimized_runner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + total_vram = 24 * 1024**3 + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + total_vram, total_vram + ) + mock_session.return_value.get_inputs.return_value = [] + mock_session.return_value.get_outputs.return_value = [] + + get_optimized_runner( + "/fake/jina.onnx", + device="GPU", + model_type=EnrichmentModelTypeEnum.jina_v2.value, + ) + + provider_opts = mock_session.call_args.kwargs["provider_options"] + self.assertIn("gpu_mem_limit", provider_opts[0]) + self.assertGreater(provider_opts[0]["gpu_mem_limit"], 0) + + class TestCudaGraphFallbackLogsException(unittest.TestCase): @patch("frigate.detectors.detection_runners.ort.InferenceSession") @patch( @@ -417,6 +532,43 @@ class TestCudaGraphFallbackLogsException(unittest.TestCase): self.assertIn("CUDA graph capture failed", joined) self.assertIn("cudaErrorStreamCaptureUnsupported", joined) + @patch("frigate.detectors.detection_runners.ort.InferenceSession") + @patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]), + ) + @patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ) + @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_fallback_warning_includes_developer_context( + self, _gs, _cdll, _rknn, _gp, mock_session + ): + # Guards the enriched warning fields (model_type, 
device_id, providers) + # so a future revert to the bare "model_path + e" form is caught. + from frigate.detectors.detection_runners import get_optimized_runner + from frigate.detectors.detector_config import ModelTypeEnum + + mock_session.side_effect = [ + RuntimeError("boom"), + MagicMock(get_inputs=lambda: [], get_outputs=lambda: []), + ] + + with self.assertLogs( + "frigate.detectors.detection_runners", level="WARNING" + ) as captured: + get_optimized_runner( + "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value + ) + + joined = "\n".join(captured.output) + self.assertIn(f"model_type={ModelTypeEnum.yologeneric.value}", joined) + self.assertIn("path=/m/yolo.onnx", joined) + self.assertIn("device_id=0", joined) + self.assertIn("CUDAExecutionProvider", joined) + if __name__ == "__main__": unittest.main() From 62ad2b09f786de5f5ced55281a3065a36b1ad9ca Mon Sep 17 00:00:00 2001 From: felalex Date: Sun, 3 May 2026 21:59:40 -0700 Subject: [PATCH 7/8] docs: clarify why mallopt is preferred over MALLOC_ARENA_MAX env var MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous wording ("forkserver spawn does not inherit Docker env vars") was technically inaccurate — multiprocessing's spawn/forkserver does pass the parent's environment via execve. The real reasons in-process mallopt is the right fix: - glibc reads MALLOC_ARENA_MAX only once, at malloc init, before the Python interpreter is up. Even if the env var arrives, it has to be present before the very first malloc call. - s6-overlay service supervision (s6-setuidgid / s6-envuidgid) can filter the env passed to the supervised process; relying on it is brittle. - mallopt(M_ARENA_MAX, n_cpu) is the official runtime equivalent and works regardless of how the process was spawned. Updates the comment in EmbeddingProcess.run() and the matching test docstrings. No behavior change. Co-Authored-By: Claude Haiku 4.5 --- frigate/embeddings/__init__.py | 11 +++++++---- frigate/test/test_detection_runners.py | 12 +++++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/frigate/embeddings/__init__.py b/frigate/embeddings/__init__.py index 610f03596..4f964343c 100644 --- a/frigate/embeddings/__init__.py +++ b/frigate/embeddings/__init__.py @@ -47,10 +47,13 @@ class EmbeddingProcess(FrigateProcess): self.metrics = metrics def run(self) -> None: - # Forkserver spawn exec's a fresh Python interpreter that does not - # inherit Docker env vars, so MALLOC_ARENA_MAX set in docker-compose - # never reaches this process. Set it here via mallopt so glibc caps - # the number of malloc arenas to N_CPU instead of the default 8×N_CPU, + # glibc reads MALLOC_ARENA_MAX only once, at malloc init - before this + # Python interpreter is even up. Setting it via docker-compose is + # brittle: it has to survive the s6-overlay service-supervision chain + # (which can filter env via s6-setuidgid/s6-envuidgid) and arrive + # before the very first malloc call. Calling mallopt(M_ARENA_MAX, n_cpu) + # here is the runtime equivalent and works regardless of how we were + # spawned, capping arenas at N_CPU instead of the default 8×N_CPU and # preventing heap fragmentation under the embeddings workload. 
try: ctypes.CDLL("libc.so.6").mallopt(-8, os.cpu_count()) # M_ARENA_MAX diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py index 2c593f7a0..c0d640f0c 100644 --- a/frigate/test/test_detection_runners.py +++ b/frigate/test/test_detection_runners.py @@ -257,8 +257,9 @@ class TestOrtLeakFixRegression(unittest.TestCase): Must remain True for fixed-size models (YOLO) to preserve buffer aliasing. 3. mallopt(M_ARENA_MAX) — must be called from inside EmbeddingProcess.run() - because forkserver spawn does not inherit Docker env vars, so setting - MALLOC_ARENA_MAX in docker-compose has no effect on the child process. + because glibc reads MALLOC_ARENA_MAX once at malloc init, and the env + var is brittle to deliver through s6-overlay supervision before that + point. In-process mallopt is the runtime-safe equivalent. """ def test_get_optimized_runner_passes_variable_length_for_jina(self): @@ -367,9 +368,10 @@ class TestOrtLeakFixRegression(unittest.TestCase): def test_embedding_process_calls_mallopt(self): """EmbeddingProcess.run() must call mallopt(M_ARENA_MAX) to cap glibc arenas. - Forkserver spawn exec's a fresh Python interpreter that does not inherit - Docker env vars. MALLOC_ARENA_MAX set in docker-compose never reaches - the child process, so mallopt() must be called explicitly from run(). + glibc reads MALLOC_ARENA_MAX only at malloc init, before this Python + interpreter is up, and the env var is brittle to deliver through the + s6-overlay service-supervision chain before that point. mallopt() + is the runtime-safe equivalent and must be called explicitly from run(). """ import frigate.embeddings as emb_module From 6fa6c5a84d6144314854589d18d7b1f0dfdc2b63 Mon Sep 17 00:00:00 2001 From: felalex Date: Sun, 3 May 2026 22:24:00 -0700 Subject: [PATCH 8/8] test: fail loudly when EnrichmentModelTypeEnum gains an unclassified value The current pattern - hardcoded include-lists in ONNXModelRunner.has_variable_length_inputs and friends - silently defaults any unknown model to "fixed-size, simple, single-threaded". For has_variable_length_inputs that default re-introduces the ORT mem-pattern leak when the new model is actually variable-length. Adds test_every_enrichment_model_is_explicitly_classified, which iterates EnrichmentModelTypeEnum and asserts every member is in one of two explicit sets the test author maintains. A new enum value without classification fails CI with a message naming the classifier to update. Also documents the same constraint on the enum itself with a TODO pointing at the longer-term fix (a co-located MODEL_TRAITS registry covering all five classifiers, not just this one). Verified by injecting a fake enum value: the test fails with the expected pointer to the missing classifier. 
Co-Authored-By: Claude Haiku 4.5 --- frigate/embeddings/types.py | 11 +++++++ frigate/test/test_detection_runners.py | 43 ++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/frigate/embeddings/types.py b/frigate/embeddings/types.py index 32cbe5dd0..c66a6e2ad 100644 --- a/frigate/embeddings/types.py +++ b/frigate/embeddings/types.py @@ -7,6 +7,17 @@ class EmbeddingTypeEnum(str, Enum): class EnrichmentModelTypeEnum(str, Enum): + # When adding a value, audit every classifier that switches on it: + # - ONNXModelRunner.has_variable_length_inputs + # - ONNXModelRunner.is_cpu_complex_model + # - ONNXModelRunner.is_migraphx_complex_model + # - ONNXModelRunner.is_concurrent_model + # - CudaGraphRunner.is_model_supported + # The default for omission is "fixed-size, simple, single-threaded" - which + # silently re-introduces the ORT mem-pattern leak if the new model is + # actually variable-length (Jina/PaddleOCR-class). + # TODO: replace these scattered include-lists with a single MODEL_TRAITS + # registry co-located with the enum so adding a value forces classification. arcface = "arcface" facenet = "facenet" jina_v1 = "jina_v1" diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py index c0d640f0c..85d90ed76 100644 --- a/frigate/test/test_detection_runners.py +++ b/frigate/test/test_detection_runners.py @@ -141,6 +141,49 @@ class TestHasVariableLengthInputs(unittest.TestCase): ) ) + def test_every_enrichment_model_is_explicitly_classified(self): + """Every EnrichmentModelTypeEnum value must be deliberately classified. + + Adding a new model to the enum without updating has_variable_length_inputs + silently defaults it to fixed-size (mem_pattern stays on), which + re-introduces the ORT mmap-plan leak if the new model is actually + variable-length. This test fails on any unclassified enum value so the + author is forced to make a deliberate decision. + + TODO: replace this guard with a single MODEL_TRAITS registry co-located + with EnrichmentModelTypeEnum so adding a value mechanically forces + classification across every classifier (variable-length, cpu_complex, + migraphx_complex, concurrent, cuda_graph_supported), not just this one. + """ + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + VARIABLE_LENGTH = { + EnrichmentModelTypeEnum.jina_v1, + EnrichmentModelTypeEnum.jina_v2, + EnrichmentModelTypeEnum.paddleocr, + } + FIXED_LENGTH = { + EnrichmentModelTypeEnum.arcface, + EnrichmentModelTypeEnum.facenet, + EnrichmentModelTypeEnum.yolov9_license_plate, + } + classified = VARIABLE_LENGTH | FIXED_LENGTH + for member in EnrichmentModelTypeEnum: + self.assertIn( + member, + classified, + f"{member.value} is not explicitly classified — audit " + "ONNXModelRunner.has_variable_length_inputs (and the other " + "classifiers listed in EnrichmentModelTypeEnum's docstring).", + ) + self.assertEqual( + ONNXModelRunner.has_variable_length_inputs(member.value), + member in VARIABLE_LENGTH, + f"{member.value}: classification disagrees with " + "has_variable_length_inputs — update one or the other.", + ) + class TestComputeCudaMemLimit(unittest.TestCase): @staticmethod