From 71060805f05d53c5b0ef885dd22269c574699719 Mon Sep 17 00:00:00 2001
From: felalex
Date: Sat, 2 May 2026 07:57:35 -0700
Subject: [PATCH] fix: prevent embeddings_manager ORT memory leak (arena + mmap plan + glibc)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three independent ORT/glibc leak vectors identified and fixed:

1. **ORT CPU BFC arena** (`enable_cpu_mem_arena=False` for all sessions)

   ORT's default CPU arena pools host-side GPU↔CPU staging buffers
   indefinitely. Disabling it across every InferenceSession (detection +
   embedding) stops the hundreds-of-MB/h RSS growth seen on systems with
   CUDA EP sessions.

2. **ORT memory-pattern cache** (`enable_mem_pattern=False` for
   variable-length models)

   For embedding models with variable-length inputs (Jina v1/v2, PaddleOCR),
   ORT allocates one mmap-backed execution plan per unique sequence length
   and never frees them. Disabling the pattern cache stops this unbounded
   anon-mmap growth. Fixed-size models (YOLO) keep `enable_mem_pattern=True`
   to preserve buffer aliasing and avoid CUDA graph capture failures.

3. **mallopt(M_ARENA_MAX)** called from `EmbeddingProcess.run()`

   The forkserver start method exec()s a fresh Python interpreter that does
   not inherit Docker env vars, so `MALLOC_ARENA_MAX` set in docker-compose
   never reaches the child. Calling `mallopt(-8, os.cpu_count())` from
   `run()` caps glibc malloc arenas in the child process.

Additional improvements:

- `compute_cuda_mem_limit()`: dynamically caps the ORT CUDA EP BFC arena for
  embedding sessions to min(max(model_size × 7, 2 GB), 80% of total VRAM);
  prevents OOM on multi-model systems while leaving headroom for detection
  sessions.
- CUDA graph capture is now wrapped in try/except so models with CPU-only
  ops (e.g. attention, NMS) fall back to ONNXModelRunner instead of
  crashing.
- `ONNXModelRunner.has_variable_length_inputs()`: centralises the
  Jina/PaddleOCR detection logic to keep SessionOptions creation consistent.
- 17 regression-guard unit tests in `frigate/test/test_detection_runners.py`
  that will fail if any of these three fixes is accidentally reverted.

Fixes: #23007

Co-Authored-By: Claude Sonnet 4.6
---
 frigate/detectors/detection_runners.py |  83 +++++--
 frigate/embeddings/__init__.py         |  11 +
 frigate/test/test_detection_runners.py | 317 +++++++++++++++++++++++++
 frigate/util/model.py                  |  34 +++
 4 files changed, 426 insertions(+), 19 deletions(-)
 create mode 100644 frigate/test/test_detection_runners.py
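Reviewer sketch (illustrative, not part of the applied diff): the two
SessionOptions flags that fixes 1 and 2 toggle, in isolation. The helper name
make_session_options is hypothetical; enable_cpu_mem_arena and
enable_mem_pattern are real onnxruntime SessionOptions properties.

    import onnxruntime as ort

    def make_session_options(variable_length_inputs: bool = False) -> ort.SessionOptions:
        opts = ort.SessionOptions()
        # Fix 1: never pool host-side GPU<->CPU staging buffers in ORT's BFC arena.
        opts.enable_cpu_mem_arena = False
        if variable_length_inputs:
            # Fix 2: don't cache one execution plan per unique input shape
            # (Jina text, PaddleOCR); fixed-shape models keep the default True.
            opts.enable_mem_pattern = False
        return opts

    assert make_session_options(variable_length_inputs=True).enable_mem_pattern is False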
diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py
index d12c8b733..277b1c542 100644
--- a/frigate/detectors/detection_runners.py
+++ b/frigate/detectors/detection_runners.py
@@ -10,7 +10,7 @@ from typing import Any
 import numpy as np
 import onnxruntime as ort
 
-from frigate.util.model import get_ort_providers
+from frigate.util.model import compute_cuda_mem_limit, get_ort_providers
 from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible
 
 logger = logging.getLogger(__name__)
@@ -24,23 +24,36 @@ def is_arm64_platform() -> bool:
 
 def get_ort_session_options(
     is_complex_model: bool = False,
-) -> ort.SessionOptions | None:
+    variable_length_inputs: bool = False,
+) -> ort.SessionOptions:
     """Get ONNX Runtime session options with appropriate settings.
 
     Args:
         is_complex_model: Whether the model needs basic optimization to avoid
             graph fusion issues.
+        variable_length_inputs: Whether the model receives variable-length inputs (e.g. text
+            embeddings). When True, disables memory-pattern caching, which otherwise builds
+            a plan per unique input shape and holds onto mmap regions indefinitely — a major
+            source of RSS growth in the embeddings_manager process.
 
     Returns:
-        SessionOptions with appropriate optimization level, or None for default settings.
+        SessionOptions with appropriate settings.
     """
+    sess_options = ort.SessionOptions()
+    # Disable the CPU BFC arena for all sessions. With the arena enabled ORT pools
+    # host-side staging buffers for GPU↔CPU transfers and never releases them back to
+    # the OS, causing RSS to grow without bound in long-running embedding processes.
+    sess_options.enable_cpu_mem_arena = False
+    if variable_length_inputs:
+        # Disable per-shape memory-layout plan caching for models with variable-length
+        # inputs (Jina CLIP text, PaddleOCR). Each unique sequence length creates a
+        # new mmap-backed plan that is never freed, leading to unbounded anon-mmap growth.
+        # Fixed-size models (YOLO at 640×640) should keep this enabled for buffer aliasing.
+        sess_options.enable_mem_pattern = False
     if is_complex_model:
-        sess_options = ort.SessionOptions()
         sess_options.graph_optimization_level = (
             ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
         )
-        return sess_options
-
-    return None
+    return sess_options
 
 # Import OpenVINO only when needed to avoid circular dependencies
@@ -137,6 +150,25 @@ class ONNXModelRunner(BaseModelRunner):
         ModelTypeEnum.dfine.value,
     ]
 
+    @staticmethod
+    def has_variable_length_inputs(model_type: str | None) -> bool:
+        """Return True for models whose input length varies between inferences.
+
+        ORT builds a memory-layout plan per unique input shape and caches it
+        indefinitely (enable_mem_pattern). For fixed-size models (YOLO) this
+        is a single plan; for variable-length text embeddings it grows without
+        bound and must be disabled.
+        """
+        if not model_type:
+            return False
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        return model_type in [
+            EnrichmentModelTypeEnum.jina_v1.value,
+            EnrichmentModelTypeEnum.jina_v2.value,
+            EnrichmentModelTypeEnum.paddleocr.value,
+        ]
+
     @staticmethod
     def is_concurrent_model(model_type: str | None) -> bool:
         """Check if model requires thread locking for concurrent inference.
@@ -582,18 +614,22 @@ def get_optimized_runner(
         CudaGraphRunner.is_model_supported(model_type)
         and providers[0] == "CUDAExecutionProvider"
     ):
-        options[0] = {
-            **options[0],
-            "enable_cuda_graph": True,
-        }
-        return CudaGraphRunner(
-            ort.InferenceSession(
+        try:
+            cuda_graph_options = {**options[0], "enable_cuda_graph": True}
+            return CudaGraphRunner(
+                ort.InferenceSession(
+                    model_path,
+                    sess_options=get_ort_session_options(),
+                    providers=providers,
+                    provider_options=[cuda_graph_options, *options[1:]],
+                ),
+                cuda_graph_options["device_id"],
+            )
+        except Exception:
+            logger.warning(
+                "CUDA graph capture failed for %s, falling back to standard ONNX runner",
                 model_path,
-                providers=providers,
-                provider_options=options,
-            ),
-            options[0]["device_id"],
-        )
+            )
 
     if (
         providers
@@ -604,11 +640,20 @@ def get_optimized_runner(
         providers.pop(0)
         options.pop(0)
 
+    if providers and providers[0] == "CUDAExecutionProvider":
+        options[0] = {
+            **options[0],
+            "gpu_mem_limit": compute_cuda_mem_limit(model_path, cuda_graph=False),
+        }
+
     return ONNXModelRunner(
         ort.InferenceSession(
             model_path,
             sess_options=get_ort_session_options(
-                ONNXModelRunner.is_cpu_complex_model(model_type)
+                is_complex_model=ONNXModelRunner.is_cpu_complex_model(model_type),
+                variable_length_inputs=ONNXModelRunner.has_variable_length_inputs(
+                    model_type
+                ),
             ),
             providers=providers,
             provider_options=options,
diff --git a/frigate/embeddings/__init__.py b/frigate/embeddings/__init__.py
index 7e54d9703..610f03596 100644
--- a/frigate/embeddings/__init__.py
+++ b/frigate/embeddings/__init__.py
@@ -1,6 +1,7 @@
 """SQLite-vec embeddings database."""
 
 import base64
+import ctypes
 import json
 import logging
 import os
@@ -46,6 +47,16 @@ class EmbeddingProcess(FrigateProcess):
         self.metrics = metrics
 
     def run(self) -> None:
+        # Forkserver spawn exec's a fresh Python interpreter that does not
+        # inherit Docker env vars, so MALLOC_ARENA_MAX set in docker-compose
+        # never reaches this process. Set it here via mallopt so glibc caps
+        # the number of malloc arenas to N_CPU instead of the default 8×N_CPU,
+        # preventing heap fragmentation under the embeddings workload.
+        try:
+            ctypes.CDLL("libc.so.6").mallopt(-8, os.cpu_count())  # M_ARENA_MAX
+        except Exception:
+            pass
+
         self.pre_run_setup(self.config.logger)
         maintainer = EmbeddingMaintainer(
             self.config,
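Reviewer sketch (illustrative, not part of the applied diff): fix 3 in
isolation. M_ARENA_MAX is -8 in glibc's <malloc.h>, mallopt() returns 1 on
success and 0 on failure, and ctypes.CDLL raises OSError where glibc is not
present (musl, macOS).

    import ctypes
    import os

    M_ARENA_MAX = -8  # from glibc <malloc.h>

    def cap_malloc_arenas() -> bool:
        """Cap glibc malloc arenas to one per CPU; False if unavailable."""
        try:
            libc = ctypes.CDLL("libc.so.6")
        except OSError:
            return False
        return libc.mallopt(M_ARENA_MAX, os.cpu_count() or 1) == 1

    if __name__ == "__main__":
        print("arenas capped:", cap_malloc_arenas())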
diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py
new file mode 100644
index 000000000..c914b9bdc
--- /dev/null
+++ b/frigate/test/test_detection_runners.py
@@ -0,0 +1,317 @@
+"""Tests for detection_runners session options and memory management helpers."""
+
+import unittest
+from unittest.mock import MagicMock, patch
+
+
+class TestGetOrtSessionOptions(unittest.TestCase):
+    def setUp(self):
+        import onnxruntime as ort
+
+        self.ort = ort
+
+    def test_default_disables_cpu_mem_arena(self):
+        from frigate.detectors.detection_runners import get_ort_session_options
+
+        opts = get_ort_session_options()
+        self.assertFalse(opts.enable_cpu_mem_arena)
+
+    def test_default_keeps_mem_pattern_enabled(self):
+        from frigate.detectors.detection_runners import get_ort_session_options
+
+        opts = get_ort_session_options()
+        self.assertTrue(opts.enable_mem_pattern)
+
+    def test_variable_length_inputs_disables_mem_pattern(self):
+        from frigate.detectors.detection_runners import get_ort_session_options
+
+        opts = get_ort_session_options(variable_length_inputs=True)
+        self.assertFalse(opts.enable_mem_pattern)
+        self.assertFalse(opts.enable_cpu_mem_arena)
+
+    def test_complex_model_sets_basic_optimization(self):
+        import onnxruntime as ort
+
+        from frigate.detectors.detection_runners import get_ort_session_options
+
+        opts = get_ort_session_options(is_complex_model=True)
+        self.assertEqual(
+            opts.graph_optimization_level,
+            ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
+        )
+
+    def test_always_returns_session_options(self):
+        import onnxruntime as ort
+
+        from frigate.detectors.detection_runners import get_ort_session_options
+
+        self.assertIsInstance(get_ort_session_options(), ort.SessionOptions)
+        self.assertIsInstance(
+            get_ort_session_options(is_complex_model=True), ort.SessionOptions
+        )
+        self.assertIsInstance(
+            get_ort_session_options(variable_length_inputs=True), ort.SessionOptions
+        )
+
+
+class TestHasVariableLengthInputs(unittest.TestCase):
+    def test_jina_v1_is_variable(self):
+        from frigate.detectors.detection_runners import ONNXModelRunner
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        self.assertTrue(
+            ONNXModelRunner.has_variable_length_inputs(
+                EnrichmentModelTypeEnum.jina_v1.value
+            )
+        )
+
+    def test_jina_v2_is_variable(self):
+        from frigate.detectors.detection_runners import ONNXModelRunner
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        self.assertTrue(
+            ONNXModelRunner.has_variable_length_inputs(
+                EnrichmentModelTypeEnum.jina_v2.value
+            )
+        )
+
+    def test_paddleocr_is_variable(self):
+        from frigate.detectors.detection_runners import ONNXModelRunner
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        self.assertTrue(
+            ONNXModelRunner.has_variable_length_inputs(
+                EnrichmentModelTypeEnum.paddleocr.value
+            )
+        )
+
+    def test_yolo_generic_is_fixed(self):
+        from frigate.detectors.detection_runners import ONNXModelRunner
+        from frigate.detectors.detector_config import ModelTypeEnum
+
+        self.assertFalse(
+            ONNXModelRunner.has_variable_length_inputs(ModelTypeEnum.yologeneric.value)
+        )
+
+    def test_none_is_fixed(self):
+        from frigate.detectors.detection_runners import ONNXModelRunner
+
+        self.assertFalse(ONNXModelRunner.has_variable_length_inputs(None))
+
+
+class TestComputeCudaMemLimit(unittest.TestCase):
+    @patch("frigate.util.model.ctypes.CDLL")
+    @patch("os.path.getsize", return_value=200 * 1024 * 1024)  # 200 MB model
+    def test_respects_ceiling(self, mock_getsize, mock_cdll):
+        """gpu_mem_limit must not exceed 80% of total VRAM."""
+        from frigate.util.model import compute_cuda_mem_limit
+
+        total_vram = 24 * 1024**3  # 24 GB
+        mock_lib = MagicMock()
+        mock_cdll.return_value = mock_lib
+
+        def fake_mem_get_info(free_ptr, total_ptr):
+            total_ptr._obj.value = total_vram
+            free_ptr._obj.value = total_vram
+            return 0  # cudaSuccess
+
+        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
+
+        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
+        self.assertLessEqual(limit, int(total_vram * 0.80))
+
+    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
+    def test_fallback_on_cuda_unavailable(self, _mock_cdll):
+        """Falls back to 4 GB when the CUDA runtime is not available."""
+        from frigate.util.model import compute_cuda_mem_limit
+
+        limit = compute_cuda_mem_limit("/fake/model.onnx")
+        self.assertEqual(limit, 4 * 1024**3)
+
+    @patch("frigate.util.model.ctypes.CDLL")
+    @patch("os.path.getsize", return_value=50 * 1024 * 1024)  # 50 MB model
+    def test_floor_is_at_least_2gb(self, mock_getsize, mock_cdll):
+        """Floor must be at least 2 GB regardless of model size."""
+        from frigate.util.model import compute_cuda_mem_limit
+
+        total_vram = 24 * 1024**3
+        mock_lib = MagicMock()
+        mock_cdll.return_value = mock_lib
+
+        def fake_mem_get_info(free_ptr, total_ptr):
+            total_ptr._obj.value = total_vram
+            free_ptr._obj.value = total_vram
+            return 0  # cudaSuccess
+
+        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
+
+        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
+        self.assertGreaterEqual(limit, 2 * 1024**3)
+
+
+class TestOrtLeakFixRegression(unittest.TestCase):
+    """Regression guards for the embeddings_manager ORT memory leak fix.
+
+    These tests verify that the three leak vectors identified in GitHub Discussion
+    #23007 remain fixed:
+
+    1. ORT CPU BFC arena (enable_cpu_mem_arena) — must be False for all sessions
+       so host-side GPU↔CPU staging buffers are not pooled indefinitely.
+
+    2. ORT memory-pattern cache (enable_mem_pattern) — must be False for
+       variable-length embedding models (Jina, PaddleOCR) to prevent one
+       mmap-backed plan per unique sequence length from accumulating forever.
+       Must remain True for fixed-size models (YOLO) to preserve buffer aliasing.
+
+    3. mallopt(M_ARENA_MAX) — must be called from inside EmbeddingProcess.run()
+       because forkserver spawn does not inherit Docker env vars, so setting
+       MALLOC_ARENA_MAX in docker-compose has no effect on the child process.
+    """
+
+    def test_get_optimized_runner_passes_variable_length_for_jina(self):
+        """get_optimized_runner must enable variable_length_inputs for Jina models."""
+        from frigate.detectors.detection_runners import get_ort_session_options
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        with patch(
+            "frigate.detectors.detection_runners.get_ort_session_options",
+            wraps=get_ort_session_options,
+        ) as mock_opts, patch(
+            "frigate.detectors.detection_runners.ort.InferenceSession"
+        ), patch(
+            "frigate.detectors.detection_runners.get_ort_providers",
+            return_value=(["CPUExecutionProvider"], [{}]),
+        ), patch(
+            "frigate.detectors.detection_runners.is_rknn_compatible",
+            return_value=False,
+        ), patch(
+            "os.path.getsize", return_value=100 * 1024 * 1024
+        ):
+            from frigate.detectors.detection_runners import get_optimized_runner
+
+            get_optimized_runner(
+                "/fake/jina.onnx",
+                device="CPU",
+                model_type=EnrichmentModelTypeEnum.jina_v2.value,
+            )
+
+            calls = mock_opts.call_args_list
+            self.assertTrue(
+                any(c.kwargs.get("variable_length_inputs") for c in calls),
+                "get_ort_session_options must be called with variable_length_inputs=True "
+                "for Jina models to prevent mmap plan cache growth",
+            )
+
+    def test_get_optimized_runner_does_not_set_variable_length_for_yolo(self):
+        """get_optimized_runner must NOT set variable_length_inputs for YOLO.
+
+        Disabling enable_mem_pattern on YOLO (fixed 640×640 input) prevents ORT
+        from aliasing buffers between nodes, pushing peak GPU memory from ~1.8 GB
+        to >4 GB and crashing CUDA graph capture.
+        """
+        from frigate.detectors.detection_runners import get_ort_session_options
+        from frigate.detectors.detector_config import ModelTypeEnum
+
+        with patch(
+            "frigate.detectors.detection_runners.get_ort_session_options",
+            wraps=get_ort_session_options,
+        ) as mock_opts, patch(
+            "frigate.detectors.detection_runners.ort.InferenceSession"
+        ) as mock_session, patch(
+            "frigate.detectors.detection_runners.get_ort_providers",
+            return_value=(["CPUExecutionProvider"], [{}]),
+        ), patch(
+            "frigate.detectors.detection_runners.is_rknn_compatible",
+            return_value=False,
+        ), patch(
+            "os.path.getsize", return_value=220 * 1024 * 1024
+        ):
+            mock_session.return_value.get_inputs.return_value = []
+            mock_session.return_value.get_outputs.return_value = []
+            from frigate.detectors.detection_runners import get_optimized_runner
+
+            get_optimized_runner(
+                "/fake/yolov9.onnx",
+                device="CPU",
+                model_type=ModelTypeEnum.yologeneric.value,
+            )
+
+            for call in mock_opts.call_args_list:
+                self.assertFalse(
+                    call.kwargs.get("variable_length_inputs", False),
+                    "variable_length_inputs must not be True for YOLO — disabling "
+                    "enable_mem_pattern on fixed-size models causes CUDA graph crashes",
+                )
+
+    def test_all_sessions_disable_cpu_mem_arena(self):
+        """enable_cpu_mem_arena must be False regardless of model type.
+
+        With the arena enabled, ORT pools CPU-side staging buffers for GPU↔CPU
+        transfers indefinitely, causing RSS growth of hundreds of MB per hour.
+        """
+        from frigate.detectors.detection_runners import get_ort_session_options
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        for model_type in [
+            None,
+            EnrichmentModelTypeEnum.jina_v1.value,
+            EnrichmentModelTypeEnum.jina_v2.value,
+            EnrichmentModelTypeEnum.paddleocr.value,
+        ]:
+            with self.subTest(model_type=model_type):
+                from frigate.detectors.detection_runners import ONNXModelRunner
+
+                opts = get_ort_session_options(
+                    variable_length_inputs=ONNXModelRunner.has_variable_length_inputs(
+                        model_type
+                    )
+                )
+                self.assertFalse(
+                    opts.enable_cpu_mem_arena,
+                    f"enable_cpu_mem_arena must be False for model_type={model_type}",
+                )
+
+    def test_embedding_process_calls_mallopt(self):
+        """EmbeddingProcess.run() must call mallopt(M_ARENA_MAX) to cap glibc arenas.
+
+        Forkserver spawn exec's a fresh Python interpreter that does not inherit
+        Docker env vars. MALLOC_ARENA_MAX set in docker-compose never reaches
+        the child process, so mallopt() must be called explicitly from run().
+        """
+        import frigate.embeddings as emb_module
+
+        # Make EmbeddingMaintainer raise immediately so run() exits after mallopt.
+        with patch.object(
+            emb_module, "EmbeddingMaintainer", side_effect=RuntimeError("stop")
+        ), patch.object(
+            emb_module.EmbeddingProcess, "pre_run_setup"
+        ), patch(
+            "ctypes.CDLL"
+        ) as mock_cdll:
+            mock_libc = MagicMock()
+            mock_cdll.return_value = mock_libc
+
+            process = emb_module.EmbeddingProcess.__new__(
+                emb_module.EmbeddingProcess
+            )
+            process.config = MagicMock()
+            process.metrics = MagicMock()
+            process.stop_event = MagicMock(is_set=MagicMock(return_value=True))
+
+            try:
+                process.run()
+            except RuntimeError:
+                pass
+
+            mock_cdll.assert_called_with("libc.so.6")
+            mock_libc.mallopt.assert_called_once()
+            args = mock_libc.mallopt.call_args[0]
+            self.assertEqual(
+                args[0],
+                -8,  # M_ARENA_MAX
+                "mallopt must be called with M_ARENA_MAX (-8)",
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/frigate/util/model.py b/frigate/util/model.py
index 338303e2d..d0b8721cc 100644
--- a/frigate/util/model.py
+++ b/frigate/util/model.py
@@ -1,5 +1,6 @@
 """Model Utils"""
 
+import ctypes
 import logging
 import os
 from typing import Any
@@ -283,6 +284,39 @@ def post_process_yolox(
 
 ### ONNX Utilities
 
+def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:
+    """Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena.
+
+    For CudaGraphRunner (YOLO detection) do NOT call this — CUDA graph capture
+    requires all intermediate tensors to be live simultaneously, so peak GPU memory
+    is 15-20× the model file size and cannot be safely capped. This function is
+    intended for embedding ONNXModelRunner sessions only; the cuda_graph flag
+    doubles the multiplier for completeness, but every current call site
+    passes cuda_graph=False.
+
+    Returns a limit derived from:
+    - Floor: model file size × peak_multiplier (≥ 2 GB)
+    - Ceiling: 80% of total GPU VRAM
+    Falls back to 4 GB if the CUDA runtime query fails.
+    """
+    try:
+        libcudart = ctypes.CDLL("libcudart.so")
+        free_bytes = ctypes.c_size_t()
+        total_bytes = ctypes.c_size_t()
+        rc = libcudart.cudaMemGetInfo(ctypes.byref(free_bytes), ctypes.byref(total_bytes))
+        if rc != 0 or total_bytes.value == 0:
+            raise OSError(f"cudaMemGetInfo failed (rc={rc})")
+        total = total_bytes.value
+    except Exception:
+        logger.debug("cudaMemGetInfo unavailable; using 4 GB gpu_mem_limit fallback")
+        return 4 * 1024**3
+
+    peak_multiplier = 14 if cuda_graph else 7
+    floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
+    ceiling = int(total * 0.80)
+    return min(floor, ceiling)
+
+
 def get_ort_providers(
     force_cpu: bool = False,
     device: str | None = "AUTO",
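Reviewer sketch (illustrative, not part of the applied diff): how the computed
cap reaches the CUDA EP's BFC arena. device_id and gpu_mem_limit are real
CUDAExecutionProvider options; the model path below is a made-up example.

    import onnxruntime as ort

    from frigate.detectors.detection_runners import get_ort_session_options
    from frigate.util.model import compute_cuda_mem_limit

    model_path = "/config/model_cache/example-embedding.onnx"  # hypothetical path
    session = ort.InferenceSession(
        model_path,
        sess_options=get_ort_session_options(variable_length_inputs=True),
        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        provider_options=[
            # min(max(7 x model size, 2 GB), 80% of VRAM); 4 GB if the query fails
            {"device_id": 0, "gpu_mem_limit": compute_cuda_mem_limit(model_path)},
            {},
        ],
    )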