mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-05-07 05:55:27 +03:00
Merge 740f2e9b68 into 76a1230885
This commit is contained in:
commit
6e084e4aca
@ -10,7 +10,7 @@ from typing import Any
|
||||
import numpy as np
|
||||
import onnxruntime as ort
|
||||
|
||||
from frigate.util.model import get_ort_providers
|
||||
from frigate.util.model import compute_cuda_mem_limit, get_ort_providers
|
||||
from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -24,24 +24,46 @@ def is_arm64_platform() -> bool:
|
||||
|
||||
def get_ort_session_options(
|
||||
is_complex_model: bool = False,
|
||||
) -> ort.SessionOptions | None:
|
||||
variable_length_inputs: bool = False,
|
||||
) -> ort.SessionOptions:
|
||||
"""Get ONNX Runtime session options with appropriate settings.
|
||||
|
||||
Args:
|
||||
is_complex_model: Whether the model needs basic optimization to avoid graph fusion issues.
|
||||
variable_length_inputs: Whether the model receives variable-length inputs (e.g. text
|
||||
embeddings). When True, disables memory-pattern caching, which otherwise builds
|
||||
a plan per unique input shape and holds onto mmap regions indefinitely - a major
|
||||
source of RSS growth in the embeddings_manager process.
|
||||
|
||||
Returns:
|
||||
SessionOptions with appropriate optimization level, or None for default settings.
|
||||
SessionOptions with appropriate settings.
|
||||
"""
|
||||
if is_complex_model:
|
||||
sess_options = ort.SessionOptions()
|
||||
# Disable the CPU BFC arena for all sessions. With the arena enabled ORT pools
|
||||
# host-side staging buffers for GPU -> CPU transfers and never releases them back to
|
||||
# the OS, causing RSS to grow without bound in long-running embedding processes.
|
||||
sess_options.enable_cpu_mem_arena = False
|
||||
if variable_length_inputs:
|
||||
# Disable per-shape memory-layout plan caching for models with variable-length
|
||||
# inputs (Jina CLIP text, PaddleOCR). Each unique sequence length creates a
|
||||
# new mmap-backed plan that is never freed, leading to unbounded anon-mmap growth.
|
||||
sess_options.enable_mem_pattern = False
|
||||
else:
|
||||
# Fixed-size models (like YOLO ) keep mem_pattern on for buffer aliasing.
|
||||
# Set explicitly to be robust against ORT default changes.
|
||||
sess_options.enable_mem_pattern = True
|
||||
if is_complex_model:
|
||||
sess_options.graph_optimization_level = (
|
||||
ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
|
||||
)
|
||||
else:
|
||||
# Most models tolerate aggressive fusions; set explicitly to be robust
|
||||
# against ORT default changes.
|
||||
sess_options.graph_optimization_level = (
|
||||
ort.GraphOptimizationLevel.ORT_ENABLE_ALL
|
||||
)
|
||||
return sess_options
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# Import OpenVINO only when needed to avoid circular dependencies
|
||||
try:
|
||||
@ -136,6 +158,25 @@ class ONNXModelRunner(BaseModelRunner):
|
||||
ModelTypeEnum.dfine.value,
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def has_variable_length_inputs(model_type: str | None) -> bool:
|
||||
"""Return True for models whose input length varies between inferences.
|
||||
|
||||
ORT builds a memory-layout plan per unique input shape and caches it
|
||||
indefinitely (enable_mem_pattern). For fixed-size models (YOLO) this
|
||||
is a single plan; for variable-length text embeddings it grows without
|
||||
bound and must be disabled.
|
||||
"""
|
||||
if not model_type:
|
||||
return False
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
return model_type in [
|
||||
EnrichmentModelTypeEnum.jina_v1.value,
|
||||
EnrichmentModelTypeEnum.jina_v2.value,
|
||||
EnrichmentModelTypeEnum.paddleocr.value,
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def is_concurrent_model(model_type: str | None) -> bool:
|
||||
"""Check if model requires thread locking for concurrent inference.
|
||||
@ -581,17 +622,26 @@ def get_optimized_runner(
|
||||
CudaGraphRunner.is_model_supported(model_type)
|
||||
and providers[0] == "CUDAExecutionProvider"
|
||||
):
|
||||
options[0] = {
|
||||
**options[0],
|
||||
"enable_cuda_graph": True,
|
||||
}
|
||||
try:
|
||||
cuda_graph_options = {**options[0], "enable_cuda_graph": True}
|
||||
return CudaGraphRunner(
|
||||
ort.InferenceSession(
|
||||
model_path,
|
||||
sess_options=get_ort_session_options(),
|
||||
providers=providers,
|
||||
provider_options=options,
|
||||
provider_options=[cuda_graph_options, *options[1:]],
|
||||
),
|
||||
options[0]["device_id"],
|
||||
cuda_graph_options["device_id"],
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"CUDA graph capture failed for model_type=%s path=%s "
|
||||
"device_id=%s providers=%s; falling back to standard ONNX runner: %s",
|
||||
model_type,
|
||||
model_path,
|
||||
cuda_graph_options.get("device_id"),
|
||||
providers,
|
||||
e,
|
||||
)
|
||||
|
||||
if (
|
||||
@ -603,11 +653,19 @@ def get_optimized_runner(
|
||||
providers.pop(0)
|
||||
options.pop(0)
|
||||
|
||||
if providers and providers[0] == "CUDAExecutionProvider":
|
||||
gpu_mem_limit = compute_cuda_mem_limit(model_path, cuda_graph=False)
|
||||
if gpu_mem_limit is not None:
|
||||
options[0] = {**options[0], "gpu_mem_limit": gpu_mem_limit}
|
||||
|
||||
return ONNXModelRunner(
|
||||
ort.InferenceSession(
|
||||
model_path,
|
||||
sess_options=get_ort_session_options(
|
||||
ONNXModelRunner.is_cpu_complex_model(model_type)
|
||||
is_complex_model=ONNXModelRunner.is_cpu_complex_model(model_type),
|
||||
variable_length_inputs=ONNXModelRunner.has_variable_length_inputs(
|
||||
model_type
|
||||
),
|
||||
),
|
||||
providers=providers,
|
||||
provider_options=options,
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"""SQLite-vec embeddings database."""
|
||||
|
||||
import base64
|
||||
import ctypes
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@ -46,6 +47,19 @@ class EmbeddingProcess(FrigateProcess):
|
||||
self.metrics = metrics
|
||||
|
||||
def run(self) -> None:
|
||||
# glibc reads MALLOC_ARENA_MAX only once, at malloc init - before this
|
||||
# Python interpreter is even up. Setting it via docker-compose is
|
||||
# brittle: it has to survive the s6-overlay service-supervision chain
|
||||
# (which can filter env via s6-setuidgid/s6-envuidgid) and arrive
|
||||
# before the very first malloc call. Calling mallopt(M_ARENA_MAX, n_cpu)
|
||||
# here is the runtime equivalent and works regardless of how we were
|
||||
# spawned, capping arenas at N_CPU instead of the default 8×N_CPU and
|
||||
# preventing heap fragmentation under the embeddings workload.
|
||||
try:
|
||||
ctypes.CDLL("libc.so.6").mallopt(-8, os.cpu_count()) # M_ARENA_MAX
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
self.pre_run_setup(self.config.logger)
|
||||
maintainer = EmbeddingMaintainer(
|
||||
self.config,
|
||||
|
||||
@ -7,6 +7,17 @@ class EmbeddingTypeEnum(str, Enum):
|
||||
|
||||
|
||||
class EnrichmentModelTypeEnum(str, Enum):
|
||||
# When adding a value, audit every classifier that switches on it:
|
||||
# - ONNXModelRunner.has_variable_length_inputs
|
||||
# - ONNXModelRunner.is_cpu_complex_model
|
||||
# - ONNXModelRunner.is_migraphx_complex_model
|
||||
# - ONNXModelRunner.is_concurrent_model
|
||||
# - CudaGraphRunner.is_model_supported
|
||||
# The default for omission is "fixed-size, simple, single-threaded" - which
|
||||
# silently re-introduces the ORT mem-pattern leak if the new model is
|
||||
# actually variable-length (Jina/PaddleOCR-class).
|
||||
# TODO: replace these scattered include-lists with a single MODEL_TRAITS
|
||||
# registry co-located with the enum so adding a value forces classification.
|
||||
arcface = "arcface"
|
||||
facenet = "facenet"
|
||||
jina_v1 = "jina_v1"
|
||||
|
||||
619
frigate/test/test_detection_runners.py
Normal file
619
frigate/test/test_detection_runners.py
Normal file
@ -0,0 +1,619 @@
|
||||
"""Tests for detection_runners session options and memory management helpers."""
|
||||
|
||||
import unittest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
|
||||
class TestGetOrtSessionOptions(unittest.TestCase):
|
||||
def setUp(self):
|
||||
import onnxruntime as ort
|
||||
|
||||
self.ort = ort
|
||||
|
||||
def test_default_disables_cpu_mem_arena(self):
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
|
||||
opts = get_ort_session_options()
|
||||
self.assertFalse(opts.enable_cpu_mem_arena)
|
||||
|
||||
def test_default_keeps_mem_pattern_enabled(self):
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
|
||||
opts = get_ort_session_options()
|
||||
self.assertTrue(opts.enable_mem_pattern)
|
||||
|
||||
def test_variable_length_inputs_disables_mem_pattern(self):
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
|
||||
opts = get_ort_session_options(variable_length_inputs=True)
|
||||
self.assertFalse(opts.enable_mem_pattern)
|
||||
self.assertFalse(opts.enable_cpu_mem_arena)
|
||||
|
||||
def test_complex_model_sets_basic_optimization(self):
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
|
||||
import onnxruntime as ort
|
||||
|
||||
opts = get_ort_session_options(is_complex_model=True)
|
||||
self.assertEqual(
|
||||
opts.graph_optimization_level,
|
||||
ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
|
||||
)
|
||||
|
||||
def test_default_sets_enable_all_optimization(self):
|
||||
# Guards the explicit `else` branch added so the optimization level is
|
||||
# never implicit — protects against ORT default changes.
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
|
||||
import onnxruntime as ort
|
||||
|
||||
opts = get_ort_session_options()
|
||||
self.assertEqual(
|
||||
opts.graph_optimization_level,
|
||||
ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
|
||||
)
|
||||
|
||||
def test_always_returns_session_options(self):
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
|
||||
import onnxruntime as ort
|
||||
|
||||
self.assertIsInstance(get_ort_session_options(), ort.SessionOptions)
|
||||
self.assertIsInstance(
|
||||
get_ort_session_options(is_complex_model=True), ort.SessionOptions
|
||||
)
|
||||
self.assertIsInstance(
|
||||
get_ort_session_options(variable_length_inputs=True), ort.SessionOptions
|
||||
)
|
||||
|
||||
|
||||
class TestHasVariableLengthInputs(unittest.TestCase):
|
||||
def test_jina_v1_is_variable(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
self.assertTrue(
|
||||
ONNXModelRunner.has_variable_length_inputs(
|
||||
EnrichmentModelTypeEnum.jina_v1.value
|
||||
)
|
||||
)
|
||||
|
||||
def test_jina_v2_is_variable(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
self.assertTrue(
|
||||
ONNXModelRunner.has_variable_length_inputs(
|
||||
EnrichmentModelTypeEnum.jina_v2.value
|
||||
)
|
||||
)
|
||||
|
||||
def test_paddleocr_is_variable(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
self.assertTrue(
|
||||
ONNXModelRunner.has_variable_length_inputs(
|
||||
EnrichmentModelTypeEnum.paddleocr.value
|
||||
)
|
||||
)
|
||||
|
||||
def test_yolo_generic_is_fixed(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.detectors.detector_config import ModelTypeEnum
|
||||
|
||||
self.assertFalse(
|
||||
ONNXModelRunner.has_variable_length_inputs(ModelTypeEnum.yologeneric.value)
|
||||
)
|
||||
|
||||
def test_none_is_fixed(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
|
||||
self.assertFalse(ONNXModelRunner.has_variable_length_inputs(None))
|
||||
|
||||
def test_arcface_is_fixed(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
self.assertFalse(
|
||||
ONNXModelRunner.has_variable_length_inputs(
|
||||
EnrichmentModelTypeEnum.arcface.value
|
||||
)
|
||||
)
|
||||
|
||||
def test_facenet_is_fixed(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
self.assertFalse(
|
||||
ONNXModelRunner.has_variable_length_inputs(
|
||||
EnrichmentModelTypeEnum.facenet.value
|
||||
)
|
||||
)
|
||||
|
||||
def test_yolov9_license_plate_is_fixed(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
self.assertFalse(
|
||||
ONNXModelRunner.has_variable_length_inputs(
|
||||
EnrichmentModelTypeEnum.yolov9_license_plate.value
|
||||
)
|
||||
)
|
||||
|
||||
def test_every_enrichment_model_is_explicitly_classified(self):
|
||||
"""Every EnrichmentModelTypeEnum value must be deliberately classified.
|
||||
|
||||
Adding a new model to the enum without updating has_variable_length_inputs
|
||||
silently defaults it to fixed-size (mem_pattern stays on), which
|
||||
re-introduces the ORT mmap-plan leak if the new model is actually
|
||||
variable-length. This test fails on any unclassified enum value so the
|
||||
author is forced to make a deliberate decision.
|
||||
|
||||
TODO: replace this guard with a single MODEL_TRAITS registry co-located
|
||||
with EnrichmentModelTypeEnum so adding a value mechanically forces
|
||||
classification across every classifier (variable-length, cpu_complex,
|
||||
migraphx_complex, concurrent, cuda_graph_supported), not just this one.
|
||||
"""
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
VARIABLE_LENGTH = {
|
||||
EnrichmentModelTypeEnum.jina_v1,
|
||||
EnrichmentModelTypeEnum.jina_v2,
|
||||
EnrichmentModelTypeEnum.paddleocr,
|
||||
}
|
||||
FIXED_LENGTH = {
|
||||
EnrichmentModelTypeEnum.arcface,
|
||||
EnrichmentModelTypeEnum.facenet,
|
||||
EnrichmentModelTypeEnum.yolov9_license_plate,
|
||||
}
|
||||
classified = VARIABLE_LENGTH | FIXED_LENGTH
|
||||
for member in EnrichmentModelTypeEnum:
|
||||
self.assertIn(
|
||||
member,
|
||||
classified,
|
||||
f"{member.value} is not explicitly classified — audit "
|
||||
"ONNXModelRunner.has_variable_length_inputs (and the other "
|
||||
"classifiers listed in EnrichmentModelTypeEnum's docstring).",
|
||||
)
|
||||
self.assertEqual(
|
||||
ONNXModelRunner.has_variable_length_inputs(member.value),
|
||||
member in VARIABLE_LENGTH,
|
||||
f"{member.value}: classification disagrees with "
|
||||
"has_variable_length_inputs — update one or the other.",
|
||||
)
|
||||
|
||||
|
||||
class TestComputeCudaMemLimit(unittest.TestCase):
|
||||
@staticmethod
|
||||
def _fake_mem_get_info(free_value: int, total_value: int):
|
||||
def _impl(free_ptr, total_ptr):
|
||||
free_ptr._obj.value = free_value
|
||||
total_ptr._obj.value = total_value
|
||||
return 0 # cudaSuccess
|
||||
|
||||
return _impl
|
||||
|
||||
@patch("frigate.util.model.ctypes.CDLL")
|
||||
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||
def test_respects_ceiling(self, _mock_getsize, mock_cdll):
|
||||
from frigate.util.model import compute_cuda_mem_limit
|
||||
|
||||
total_vram = 24 * 1024**3
|
||||
mock_lib = MagicMock()
|
||||
mock_cdll.return_value = mock_lib
|
||||
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
|
||||
total_vram, total_vram
|
||||
)
|
||||
|
||||
limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
|
||||
self.assertLessEqual(limit, int(total_vram * 0.80))
|
||||
|
||||
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
|
||||
def test_returns_none_when_cuda_unavailable(self, _mock_cdll):
|
||||
# See compute_cuda_mem_limit docstring for the tradeoff: returning a
|
||||
# hardcoded fallback was wrong for low-VRAM devices (Jetson Nano, K620).
|
||||
from frigate.util.model import compute_cuda_mem_limit
|
||||
|
||||
self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx"))
|
||||
|
||||
@patch("frigate.util.model.ctypes.CDLL")
|
||||
@patch("os.path.getsize", return_value=50 * 1024 * 1024)
|
||||
def test_floor_is_at_least_2gb(self, _mock_getsize, mock_cdll):
|
||||
from frigate.util.model import compute_cuda_mem_limit
|
||||
|
||||
total_vram = 24 * 1024**3
|
||||
mock_lib = MagicMock()
|
||||
mock_cdll.return_value = mock_lib
|
||||
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
|
||||
total_vram, total_vram
|
||||
)
|
||||
|
||||
limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
|
||||
self.assertGreaterEqual(limit, 2 * 1024**3)
|
||||
|
||||
@patch("frigate.util.model.ctypes.CDLL")
|
||||
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||
def test_returns_none_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll):
|
||||
# Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0,
|
||||
# producing gpu_mem_limit=0 and immediate session OOM. We now return
|
||||
# None so the caller omits gpu_mem_limit and ORT manages the arena.
|
||||
from frigate.util.model import compute_cuda_mem_limit
|
||||
|
||||
mock_lib = MagicMock()
|
||||
mock_cdll.return_value = mock_lib
|
||||
mock_lib.cudaMemGetInfo.return_value = 2 # cudaErrorMemoryAllocation
|
||||
|
||||
self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False))
|
||||
|
||||
@patch("frigate.util.model.ctypes.CDLL")
|
||||
@patch("os.path.getsize", return_value=500 * 1024 * 1024)
|
||||
def test_cuda_graph_doubles_peak_multiplier(self, _mock_getsize, mock_cdll):
|
||||
# cuda_graph=True must use peak_multiplier=14 (vs 7 for cuda_graph=False)
|
||||
# because graph capture pins all intermediate tensors live simultaneously.
|
||||
from frigate.util.model import compute_cuda_mem_limit
|
||||
|
||||
total_vram = 24 * 1024**3
|
||||
mock_lib = MagicMock()
|
||||
mock_cdll.return_value = mock_lib
|
||||
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
|
||||
total_vram, total_vram
|
||||
)
|
||||
|
||||
model_size = 500 * 1024 * 1024
|
||||
with_graph = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=True)
|
||||
without_graph = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
|
||||
self.assertGreaterEqual(with_graph, model_size * 14)
|
||||
self.assertGreaterEqual(without_graph, model_size * 7)
|
||||
self.assertGreater(with_graph, without_graph)
|
||||
|
||||
@patch("frigate.util.model.ctypes.CDLL")
|
||||
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||
def test_capped_by_free_vram_when_constrained(self, _mock_getsize, mock_cdll):
|
||||
# Bug #2: with 3 GB free of 24 GB, the limit must respect free × 0.9,
|
||||
# not 80% of total — co-resident embedding sessions would OOM otherwise.
|
||||
from frigate.util.model import compute_cuda_mem_limit
|
||||
|
||||
mock_lib = MagicMock()
|
||||
mock_cdll.return_value = mock_lib
|
||||
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
|
||||
3 * 1024**3, 24 * 1024**3
|
||||
)
|
||||
|
||||
limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
|
||||
self.assertLessEqual(limit, int(3 * 1024**3 * 0.90))
|
||||
|
||||
|
||||
class TestOrtLeakFixRegression(unittest.TestCase):
|
||||
"""Regression guards for the embeddings_manager ORT memory leak fix.
|
||||
|
||||
These tests verify that the three leak vectors identified in GitHub Discussion
|
||||
#23007 remain fixed:
|
||||
|
||||
1. ORT CPU BFC arena (enable_cpu_mem_arena) — must be False for all sessions
|
||||
so host-side GPU↔CPU staging buffers are not pooled indefinitely.
|
||||
|
||||
2. ORT memory-pattern cache (enable_mem_pattern) — must be False for
|
||||
variable-length embedding models (Jina, PaddleOCR) to prevent one
|
||||
mmap-backed plan per unique sequence length from accumulating forever.
|
||||
Must remain True for fixed-size models (YOLO) to preserve buffer aliasing.
|
||||
|
||||
3. mallopt(M_ARENA_MAX) — must be called from inside EmbeddingProcess.run()
|
||||
because glibc reads MALLOC_ARENA_MAX once at malloc init, and the env
|
||||
var is brittle to deliver through s6-overlay supervision before that
|
||||
point. In-process mallopt is the runtime-safe equivalent.
|
||||
"""
|
||||
|
||||
def test_get_optimized_runner_passes_variable_length_for_jina(self):
|
||||
"""get_optimized_runner must enable variable_length_inputs for Jina models."""
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
with patch(
|
||||
"frigate.detectors.detection_runners.get_ort_session_options",
|
||||
wraps=get_ort_session_options,
|
||||
) as mock_opts, patch(
|
||||
"frigate.detectors.detection_runners.ort.InferenceSession"
|
||||
), patch(
|
||||
"frigate.detectors.detection_runners.get_ort_providers",
|
||||
return_value=(["CPUExecutionProvider"], [{}]),
|
||||
), patch(
|
||||
"frigate.detectors.detection_runners.is_rknn_compatible",
|
||||
return_value=False,
|
||||
), patch(
|
||||
"os.path.getsize", return_value=100 * 1024 * 1024
|
||||
):
|
||||
from frigate.detectors.detection_runners import get_optimized_runner
|
||||
|
||||
get_optimized_runner(
|
||||
"/fake/jina.onnx",
|
||||
device="CPU",
|
||||
model_type=EnrichmentModelTypeEnum.jina_v2.value,
|
||||
)
|
||||
|
||||
calls = mock_opts.call_args_list
|
||||
self.assertTrue(
|
||||
any(c.kwargs.get("variable_length_inputs") for c in calls),
|
||||
"get_ort_session_options must be called with variable_length_inputs=True "
|
||||
"for Jina models to prevent mmap plan cache growth",
|
||||
)
|
||||
|
||||
def test_get_optimized_runner_does_not_set_variable_length_for_yolo(self):
|
||||
"""get_optimized_runner must NOT set variable_length_inputs for YOLO.
|
||||
|
||||
Disabling enable_mem_pattern on YOLO (fixed 640×640 input) prevents ORT
|
||||
from aliasing buffers between nodes, pushing peak GPU memory from ~1.8 GB
|
||||
to >4 GB and crashing CUDA graph capture.
|
||||
"""
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
from frigate.detectors.detector_config import ModelTypeEnum
|
||||
|
||||
with patch(
|
||||
"frigate.detectors.detection_runners.get_ort_session_options",
|
||||
wraps=get_ort_session_options,
|
||||
) as mock_opts, patch(
|
||||
"frigate.detectors.detection_runners.ort.InferenceSession"
|
||||
) as mock_session, patch(
|
||||
"frigate.detectors.detection_runners.get_ort_providers",
|
||||
return_value=(["CPUExecutionProvider"], [{}]),
|
||||
), patch(
|
||||
"frigate.detectors.detection_runners.is_rknn_compatible",
|
||||
return_value=False,
|
||||
), patch(
|
||||
"os.path.getsize", return_value=220 * 1024 * 1024
|
||||
):
|
||||
mock_session.return_value.get_inputs.return_value = []
|
||||
mock_session.return_value.get_outputs.return_value = []
|
||||
from frigate.detectors.detection_runners import get_optimized_runner
|
||||
|
||||
get_optimized_runner(
|
||||
"/fake/yolov9.onnx",
|
||||
device="CPU",
|
||||
model_type=ModelTypeEnum.yologeneric.value,
|
||||
)
|
||||
|
||||
for call in mock_opts.call_args_list:
|
||||
self.assertFalse(
|
||||
call.kwargs.get("variable_length_inputs", False),
|
||||
"variable_length_inputs must not be True for YOLO — disabling "
|
||||
"enable_mem_pattern on fixed-size models causes CUDA graph crashes",
|
||||
)
|
||||
|
||||
def test_all_sessions_disable_cpu_mem_arena(self):
|
||||
"""enable_cpu_mem_arena must be False regardless of model type.
|
||||
|
||||
With the arena enabled, ORT pools CPU-side staging buffers for GPU↔CPU
|
||||
transfers indefinitely, causing RSS growth of hundreds of MB per hour.
|
||||
"""
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
for model_type in [
|
||||
None,
|
||||
EnrichmentModelTypeEnum.jina_v1.value,
|
||||
EnrichmentModelTypeEnum.jina_v2.value,
|
||||
EnrichmentModelTypeEnum.paddleocr.value,
|
||||
]:
|
||||
with self.subTest(model_type=model_type):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
|
||||
opts = get_ort_session_options(
|
||||
variable_length_inputs=ONNXModelRunner.has_variable_length_inputs(
|
||||
model_type
|
||||
)
|
||||
)
|
||||
self.assertFalse(
|
||||
opts.enable_cpu_mem_arena,
|
||||
f"enable_cpu_mem_arena must be False for model_type={model_type}",
|
||||
)
|
||||
|
||||
def test_embedding_process_calls_mallopt(self):
|
||||
"""EmbeddingProcess.run() must call mallopt(M_ARENA_MAX) to cap glibc arenas.
|
||||
|
||||
glibc reads MALLOC_ARENA_MAX only at malloc init, before this Python
|
||||
interpreter is up, and the env var is brittle to deliver through the
|
||||
s6-overlay service-supervision chain before that point. mallopt()
|
||||
is the runtime-safe equivalent and must be called explicitly from run().
|
||||
"""
|
||||
import frigate.embeddings as emb_module
|
||||
|
||||
# Make EmbeddingMaintainer raise immediately so run() exits after mallopt.
|
||||
with patch.object(
|
||||
emb_module, "EmbeddingMaintainer", side_effect=RuntimeError("stop")
|
||||
), patch.object(
|
||||
emb_module.EmbeddingProcess, "pre_run_setup"
|
||||
), patch(
|
||||
"ctypes.CDLL"
|
||||
) as mock_cdll:
|
||||
mock_libc = MagicMock()
|
||||
mock_cdll.return_value = mock_libc
|
||||
|
||||
process = emb_module.EmbeddingProcess.__new__(
|
||||
emb_module.EmbeddingProcess
|
||||
)
|
||||
process.config = MagicMock()
|
||||
process.metrics = MagicMock()
|
||||
process.stop_event = MagicMock(is_set=MagicMock(return_value=True))
|
||||
|
||||
try:
|
||||
process.run()
|
||||
except RuntimeError:
|
||||
pass
|
||||
|
||||
mock_cdll.assert_called_with("libc.so.6")
|
||||
mock_libc.mallopt.assert_called_once()
|
||||
args = mock_libc.mallopt.call_args[0]
|
||||
self.assertEqual(
|
||||
args[0],
|
||||
-8, # M_ARENA_MAX
|
||||
"mallopt must be called with M_ARENA_MAX (-8)",
|
||||
)
|
||||
|
||||
|
||||
class TestRunnerOmitsGpuMemLimitOnCudaQueryFailure(unittest.TestCase):
|
||||
"""When compute_cuda_mem_limit returns None, get_optimized_runner must NOT
|
||||
inject gpu_mem_limit at all, leaving ORT's grow-as-needed default in place."""
|
||||
|
||||
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
|
||||
@patch(
|
||||
"frigate.detectors.detection_runners.get_ort_providers",
|
||||
return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
|
||||
)
|
||||
@patch(
|
||||
"frigate.detectors.detection_runners.is_rknn_compatible",
|
||||
return_value=False,
|
||||
)
|
||||
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
|
||||
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||
def test_no_gpu_mem_limit_key_when_cuda_query_fails(
|
||||
self, _gs, _cdll, _rknn, _gp, mock_session
|
||||
):
|
||||
from frigate.detectors.detection_runners import get_optimized_runner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
mock_session.return_value.get_inputs.return_value = []
|
||||
mock_session.return_value.get_outputs.return_value = []
|
||||
|
||||
get_optimized_runner(
|
||||
"/fake/jina.onnx",
|
||||
device="GPU",
|
||||
model_type=EnrichmentModelTypeEnum.jina_v2.value,
|
||||
)
|
||||
|
||||
provider_opts = mock_session.call_args.kwargs["provider_options"]
|
||||
self.assertNotIn(
|
||||
"gpu_mem_limit",
|
||||
provider_opts[0],
|
||||
"Must omit (not set to 0, not set to a guess) when query fails",
|
||||
)
|
||||
|
||||
|
||||
class TestRunnerInjectsGpuMemLimitOnCudaQuerySuccess(unittest.TestCase):
|
||||
"""Positive counterpart to TestRunnerOmitsGpuMemLimitOnCudaQueryFailure:
|
||||
when cudaMemGetInfo succeeds, gpu_mem_limit must be injected into
|
||||
provider_options so ORT's BFC arena is bounded."""
|
||||
|
||||
@staticmethod
|
||||
def _fake_mem_get_info(free_value: int, total_value: int):
|
||||
def _impl(free_ptr, total_ptr):
|
||||
free_ptr._obj.value = free_value
|
||||
total_ptr._obj.value = total_value
|
||||
return 0 # cudaSuccess
|
||||
|
||||
return _impl
|
||||
|
||||
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
|
||||
@patch(
|
||||
"frigate.detectors.detection_runners.get_ort_providers",
|
||||
return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
|
||||
)
|
||||
@patch(
|
||||
"frigate.detectors.detection_runners.is_rknn_compatible",
|
||||
return_value=False,
|
||||
)
|
||||
@patch("frigate.util.model.ctypes.CDLL")
|
||||
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||
def test_gpu_mem_limit_key_present_when_cuda_query_succeeds(
|
||||
self, _gs, mock_cdll, _rknn, _gp, mock_session
|
||||
):
|
||||
from frigate.detectors.detection_runners import get_optimized_runner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
total_vram = 24 * 1024**3
|
||||
mock_lib = MagicMock()
|
||||
mock_cdll.return_value = mock_lib
|
||||
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
|
||||
total_vram, total_vram
|
||||
)
|
||||
mock_session.return_value.get_inputs.return_value = []
|
||||
mock_session.return_value.get_outputs.return_value = []
|
||||
|
||||
get_optimized_runner(
|
||||
"/fake/jina.onnx",
|
||||
device="GPU",
|
||||
model_type=EnrichmentModelTypeEnum.jina_v2.value,
|
||||
)
|
||||
|
||||
provider_opts = mock_session.call_args.kwargs["provider_options"]
|
||||
self.assertIn("gpu_mem_limit", provider_opts[0])
|
||||
self.assertGreater(provider_opts[0]["gpu_mem_limit"], 0)
|
||||
|
||||
|
||||
class TestCudaGraphFallbackLogsException(unittest.TestCase):
    """Regression tests for the CUDA-graph -> plain ONNX runner fallback warning.

    Both tests force the first InferenceSession construction to raise (simulating
    a failed CUDA graph capture) and let the second one succeed, then inspect the
    WARNING log emitted by get_optimized_runner.
    """

    @patch("frigate.detectors.detection_runners.ort.InferenceSession")
    @patch(
        "frigate.detectors.detection_runners.get_ort_providers",
        return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
    )
    @patch(
        "frigate.detectors.detection_runners.is_rknn_compatible",
        return_value=False,
    )
    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
    def test_fallback_warning_includes_exception_text(
        self, _getsize, _cdll, _rknn_check, _providers, session_cls
    ):
        # Concern #1: the bare `except Exception:` swallowed the underlying
        # ORT error (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.),
        # turning a debuggable failure into an opaque "fell back to ONNX runner".
        from frigate.detectors.detection_runners import get_optimized_runner
        from frigate.detectors.detector_config import ModelTypeEnum

        # First construction attempt (CUDA graph) blows up; retry succeeds.
        session_cls.side_effect = [
            RuntimeError("cudaErrorStreamCaptureUnsupported"),
            MagicMock(get_inputs=lambda: [], get_outputs=lambda: []),
        ]

        with self.assertLogs(
            "frigate.detectors.detection_runners", level="WARNING"
        ) as logs:
            get_optimized_runner(
                "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value
            )

        warning_text = "\n".join(logs.output)
        for expected in ("CUDA graph capture failed", "cudaErrorStreamCaptureUnsupported"):
            self.assertIn(expected, warning_text)

    @patch("frigate.detectors.detection_runners.ort.InferenceSession")
    @patch(
        "frigate.detectors.detection_runners.get_ort_providers",
        return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
    )
    @patch(
        "frigate.detectors.detection_runners.is_rknn_compatible",
        return_value=False,
    )
    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
    def test_fallback_warning_includes_developer_context(
        self, _getsize, _cdll, _rknn_check, _providers, session_cls
    ):
        # Guards the enriched warning fields (model_type, device_id, providers)
        # so a future revert to the bare "model_path + e" form is caught.
        from frigate.detectors.detection_runners import get_optimized_runner
        from frigate.detectors.detector_config import ModelTypeEnum

        session_cls.side_effect = [
            RuntimeError("boom"),
            MagicMock(get_inputs=lambda: [], get_outputs=lambda: []),
        ]

        with self.assertLogs(
            "frigate.detectors.detection_runners", level="WARNING"
        ) as logs:
            get_optimized_runner(
                "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value
            )

        warning_text = "\n".join(logs.output)
        self.assertIn(f"model_type={ModelTypeEnum.yologeneric.value}", warning_text)
        self.assertIn("path=/m/yolo.onnx", warning_text)
        self.assertIn("device_id=0", warning_text)
        self.assertIn("CUDAExecutionProvider", warning_text)
|
||||
|
||||
|
||||
# Allow running this test module directly (e.g. `python test_file.py`)
# in addition to the normal pytest/unittest discovery.
if __name__ == "__main__":
    unittest.main()
|
||||
@ -1,5 +1,6 @@
|
||||
"""Model Utils"""
|
||||
|
||||
import ctypes
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
@ -283,6 +284,56 @@ def post_process_yolox(
|
||||
### ONNX Utilities
|
||||
|
||||
|
||||
def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int | None:
    """Derive a per-session gpu_mem_limit for the ORT CUDA EP BFC arena.

    Intended only for embedding ONNXModelRunner sessions. CudaGraphRunner
    (YOLO detection) must NOT use this: graph capture keeps every intermediate
    tensor live at once, so its peak GPU usage (15-20x the model file size)
    cannot be safely capped.

    The limit is the smallest of:
      - model file size x a peak-usage multiplier (floored at 2 GiB),
      - 80% of total VRAM,
      - 90% of currently free VRAM (so co-resident embedding sessions —
        jina text + vision, paddleocr det + rec, arcface — don't OOM each
        other on shared GPUs).

    Returns None when the CUDA runtime query fails; the caller MUST then omit
    gpu_mem_limit entirely so ORT falls back to its own grow-as-needed default.
    A hardcoded fallback (e.g. 4 GB) was rejected: it breaks session init on
    small/shared-memory devices (Jetson Nano, 2 GB Quadro K620 / GT 1030,
    broken /dev/nvidia* passthrough) and needlessly starves sessions on large
    cards (24 GB RTX 3090). The leak vectors this addresses (mem_pattern,
    mallopt) are independent of the arena cap, so dropping the cap on the
    failure path does not reintroduce the leak.
    """
    try:
        runtime = ctypes.CDLL("libcudart.so")
        free_ct, total_ct = ctypes.c_size_t(), ctypes.c_size_t()
        rc = runtime.cudaMemGetInfo(ctypes.byref(free_ct), ctypes.byref(total_ct))
        # A zero total also means the query didn't really work (e.g. stub lib).
        if rc != 0 or total_ct.value == 0:
            raise RuntimeError(f"cudaMemGetInfo rc={rc} total={total_ct.value}")
    except Exception as e:
        # Deliberate best-effort: no usable CUDA runtime -> let ORT self-manage.
        logger.debug("cudaMemGetInfo unavailable (%s); omitting gpu_mem_limit", e)
        return None

    total, free = total_ct.value, free_ct.value
    multiplier = 14 if cuda_graph else 7
    wanted = max(os.path.getsize(model_path) * multiplier, 2 * (1 << 30))
    return min(wanted, int(total * 0.80), int(free * 0.90))
|
||||
|
||||
|
||||
def get_ort_providers(
|
||||
force_cpu: bool = False,
|
||||
device: str | None = "AUTO",
|
||||
|
||||
Loading…
Reference in New Issue
Block a user