From 71060805f05d53c5b0ef885dd22269c574699719 Mon Sep 17 00:00:00 2001
From: felalex
Date: Sat, 2 May 2026 07:57:35 -0700
Subject: [PATCH] fix: prevent embeddings_manager ORT memory leak (arena + mmap plan + glibc)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three independent ORT/glibc leak vectors identified and fixed:

1. **ORT CPU BFC arena** (`enable_cpu_mem_arena=False` for all sessions)

   ORT's default CPU arena pools host-side GPU↔CPU staging buffers
   indefinitely. Disabling it across every InferenceSession (detection +
   embedding) stops the hundreds-of-MB/h RSS growth seen on systems with
   CUDA EP sessions.

2. **ORT memory-pattern cache** (`enable_mem_pattern=False` for
   variable-length models)

   For embedding models with variable-length inputs (Jina v1/v2, PaddleOCR),
   ORT allocates one mmap-backed execution plan per unique sequence length
   and never frees them. Disabling the pattern cache stops this unbounded
   anon-mmap growth. Fixed-size models (YOLO) keep `enable_mem_pattern=True`
   to preserve buffer aliasing and avoid CUDA graph capture failures.

3. **mallopt(M_ARENA_MAX)** called from `EmbeddingProcess.run()`

   The forkserver start method exec()s a fresh Python interpreter that does
   not inherit Docker env vars, so `MALLOC_ARENA_MAX` set in docker-compose
   never reaches the child. Calling `mallopt(-8, os.cpu_count())` from
   `run()` caps glibc malloc arenas in the child process.

Additional improvements:

- `compute_cuda_mem_limit()`: dynamically caps the ORT CUDA EP BFC arena for
  embedding sessions to min(max(model_size × 7, 2 GB), 80% of total VRAM);
  prevents OOM on multi-model systems while leaving headroom for detection
  sessions.
- CUDA graph capture is now wrapped in try/except so models with CPU-only
  ops (e.g. attention, NMS) fall back to ONNXModelRunner instead of
  crashing.
- `ONNXModelRunner.has_variable_length_inputs()`: centralises the
  Jina/PaddleOCR detection logic to keep SessionOptions creation consistent.
- 17 regression-guard unit tests in `frigate/test/test_detection_runners.py`
  that will fail if any of these three fixes is accidentally reverted.

Fixes: #23007

Co-Authored-By: Claude Sonnet 4.6
---
 frigate/detectors/detection_runners.py |  83 +++++--
 frigate/embeddings/__init__.py         |  11 +
 frigate/test/test_detection_runners.py | 317 +++++++++++++++++++++++++
 frigate/util/model.py                  |  34 +++
 4 files changed, 426 insertions(+), 19 deletions(-)
 create mode 100644 frigate/test/test_detection_runners.py
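Reviewer sketch (illustrative, not part of the applied diff): the two
SessionOptions flags that fixes 1 and 2 toggle, in isolation. The helper name
make_session_options is hypothetical; enable_cpu_mem_arena and
enable_mem_pattern are real onnxruntime SessionOptions properties.

    import onnxruntime as ort

    def make_session_options(variable_length_inputs: bool = False) -> ort.SessionOptions:
        opts = ort.SessionOptions()
        # Fix 1: never pool host-side GPU<->CPU staging buffers in ORT's BFC arena.
        opts.enable_cpu_mem_arena = False
        if variable_length_inputs:
            # Fix 2: don't cache one execution plan per unique input shape
            # (Jina text, PaddleOCR); fixed-shape models keep the default True.
            opts.enable_mem_pattern = False
        return opts

    assert make_session_options(variable_length_inputs=True).enable_mem_pattern is False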
diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py
index d12c8b733..277b1c542 100644
--- a/frigate/detectors/detection_runners.py
+++ b/frigate/detectors/detection_runners.py
@@ -10,7 +10,7 @@ from typing import Any
 import numpy as np
 import onnxruntime as ort
 
-from frigate.util.model import get_ort_providers
+from frigate.util.model import compute_cuda_mem_limit, get_ort_providers
 from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible
 
 logger = logging.getLogger(__name__)
@@ -24,23 +24,36 @@ def is_arm64_platform() -> bool:
 
 def get_ort_session_options(
     is_complex_model: bool = False,
-) -> ort.SessionOptions | None:
+    variable_length_inputs: bool = False,
+) -> ort.SessionOptions:
     """Get ONNX Runtime session options with appropriate settings.
 
     Args:
         is_complex_model: Whether the model needs basic optimization to avoid
             graph fusion issues.
+        variable_length_inputs: Whether the model receives variable-length inputs (e.g. text
+            embeddings). When True, disables memory-pattern caching, which otherwise builds
+            a plan per unique input shape and holds onto mmap regions indefinitely — a major
+            source of RSS growth in the embeddings_manager process.
 
     Returns:
-        SessionOptions with appropriate optimization level, or None for default settings.
+        SessionOptions with appropriate settings.
     """
+    sess_options = ort.SessionOptions()
+    # Disable the CPU BFC arena for all sessions. With the arena enabled ORT pools
+    # host-side staging buffers for GPU↔CPU transfers and never releases them back to
+    # the OS, causing RSS to grow without bound in long-running embedding processes.
+    sess_options.enable_cpu_mem_arena = False
+    if variable_length_inputs:
+        # Disable per-shape memory-layout plan caching for models with variable-length
+        # inputs (Jina CLIP text, PaddleOCR). Each unique sequence length creates a
+        # new mmap-backed plan that is never freed, leading to unbounded anon-mmap growth.
+        # Fixed-size models (YOLO at 640×640) should keep this enabled for buffer aliasing.
+        sess_options.enable_mem_pattern = False
     if is_complex_model:
-        sess_options = ort.SessionOptions()
         sess_options.graph_optimization_level = (
             ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
         )
-        return sess_options
-
-    return None
+    return sess_options
 
 # Import OpenVINO only when needed to avoid circular dependencies
@@ -137,6 +150,25 @@ class ONNXModelRunner(BaseModelRunner):
         ModelTypeEnum.dfine.value,
     ]
 
+    @staticmethod
+    def has_variable_length_inputs(model_type: str | None) -> bool:
+        """Return True for models whose input length varies between inferences.
+
+        ORT builds a memory-layout plan per unique input shape and caches it
+        indefinitely (enable_mem_pattern). For fixed-size models (YOLO) this
+        is a single plan; for variable-length text embeddings it grows without
+        bound and must be disabled.
+        """
+        if not model_type:
+            return False
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        return model_type in [
+            EnrichmentModelTypeEnum.jina_v1.value,
+            EnrichmentModelTypeEnum.jina_v2.value,
+            EnrichmentModelTypeEnum.paddleocr.value,
+        ]
+
     @staticmethod
     def is_concurrent_model(model_type: str | None) -> bool:
         """Check if model requires thread locking for concurrent inference.
@@ -582,18 +614,22 @@ def get_optimized_runner(
         CudaGraphRunner.is_model_supported(model_type)
         and providers[0] == "CUDAExecutionProvider"
     ):
-        options[0] = {
-            **options[0],
-            "enable_cuda_graph": True,
-        }
-        return CudaGraphRunner(
-            ort.InferenceSession(
+        try:
+            cuda_graph_options = {**options[0], "enable_cuda_graph": True}
+            return CudaGraphRunner(
+                ort.InferenceSession(
+                    model_path,
+                    sess_options=get_ort_session_options(),
+                    providers=providers,
+                    provider_options=[cuda_graph_options, *options[1:]],
+                ),
+                cuda_graph_options["device_id"],
+            )
+        except Exception:
+            logger.warning(
+                "CUDA graph capture failed for %s, falling back to standard ONNX runner",
                 model_path,
-                providers=providers,
-                provider_options=options,
-            ),
-            options[0]["device_id"],
-        )
+            )
 
     if (
         providers
@@ -604,11 +640,20 @@ def get_optimized_runner(
         providers.pop(0)
         options.pop(0)
 
+    if providers and providers[0] == "CUDAExecutionProvider":
+        options[0] = {
+            **options[0],
+            "gpu_mem_limit": compute_cuda_mem_limit(model_path, cuda_graph=False),
+        }
+
     return ONNXModelRunner(
         ort.InferenceSession(
             model_path,
             sess_options=get_ort_session_options(
-                ONNXModelRunner.is_cpu_complex_model(model_type)
+                is_complex_model=ONNXModelRunner.is_cpu_complex_model(model_type),
+                variable_length_inputs=ONNXModelRunner.has_variable_length_inputs(
+                    model_type
+                ),
             ),
             providers=providers,
             provider_options=options,
diff --git a/frigate/embeddings/__init__.py b/frigate/embeddings/__init__.py
index 7e54d9703..610f03596 100644
--- a/frigate/embeddings/__init__.py
+++ b/frigate/embeddings/__init__.py
@@ -1,6 +1,7 @@
 """SQLite-vec embeddings database."""
 
 import base64
+import ctypes
 import json
 import logging
 import os
@@ -46,6 +47,16 @@ class EmbeddingProcess(FrigateProcess):
         self.metrics = metrics
 
     def run(self) -> None:
+        # Forkserver spawn exec's a fresh Python interpreter that does not
+        # inherit Docker env vars, so MALLOC_ARENA_MAX set in docker-compose
+        # never reaches this process. Set it here via mallopt so glibc caps
+        # the number of malloc arenas to N_CPU instead of the default 8×N_CPU,
+        # preventing heap fragmentation under the embeddings workload.
+        try:
+            ctypes.CDLL("libc.so.6").mallopt(-8, os.cpu_count())  # M_ARENA_MAX
+        except Exception:
+            pass
+
         self.pre_run_setup(self.config.logger)
         maintainer = EmbeddingMaintainer(
             self.config,
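Reviewer sketch (illustrative, not part of the applied diff): fix 3 in
isolation. M_ARENA_MAX is -8 in glibc's <malloc.h>, mallopt() returns 1 on
success and 0 on failure, and ctypes.CDLL raises OSError where glibc is not
present (musl, macOS).

    import ctypes
    import os

    M_ARENA_MAX = -8  # from glibc <malloc.h>

    def cap_malloc_arenas() -> bool:
        """Cap glibc malloc arenas to one per CPU; False if unavailable."""
        try:
            libc = ctypes.CDLL("libc.so.6")
        except OSError:
            return False
        return libc.mallopt(M_ARENA_MAX, os.cpu_count() or 1) == 1

    if __name__ == "__main__":
        print("arenas capped:", cap_malloc_arenas())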
diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py
new file mode 100644
index 000000000..c914b9bdc
--- /dev/null
+++ b/frigate/test/test_detection_runners.py
@@ -0,0 +1,317 @@
+"""Tests for detection_runners session options and memory management helpers."""
+
+import unittest
+from unittest.mock import MagicMock, patch
+
+
+class TestGetOrtSessionOptions(unittest.TestCase):
+    def setUp(self):
+        import onnxruntime as ort
+
+        self.ort = ort
+
+    def test_default_disables_cpu_mem_arena(self):
+        from frigate.detectors.detection_runners import get_ort_session_options
+
+        opts = get_ort_session_options()
+        self.assertFalse(opts.enable_cpu_mem_arena)
+
+    def test_default_keeps_mem_pattern_enabled(self):
+        from frigate.detectors.detection_runners import get_ort_session_options
+
+        opts = get_ort_session_options()
+        self.assertTrue(opts.enable_mem_pattern)
+
+    def test_variable_length_inputs_disables_mem_pattern(self):
+        from frigate.detectors.detection_runners import get_ort_session_options
+
+        opts = get_ort_session_options(variable_length_inputs=True)
+        self.assertFalse(opts.enable_mem_pattern)
+        self.assertFalse(opts.enable_cpu_mem_arena)
+
+    def test_complex_model_sets_basic_optimization(self):
+        import onnxruntime as ort
+
+        from frigate.detectors.detection_runners import get_ort_session_options
+
+        opts = get_ort_session_options(is_complex_model=True)
+        self.assertEqual(
+            opts.graph_optimization_level,
+            ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
+        )
+
+    def test_always_returns_session_options(self):
+        import onnxruntime as ort
+
+        from frigate.detectors.detection_runners import get_ort_session_options
+
+        self.assertIsInstance(get_ort_session_options(), ort.SessionOptions)
+        self.assertIsInstance(
+            get_ort_session_options(is_complex_model=True), ort.SessionOptions
+        )
+        self.assertIsInstance(
+            get_ort_session_options(variable_length_inputs=True), ort.SessionOptions
+        )
+
+
+class TestHasVariableLengthInputs(unittest.TestCase):
+    def test_jina_v1_is_variable(self):
+        from frigate.detectors.detection_runners import ONNXModelRunner
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        self.assertTrue(
+            ONNXModelRunner.has_variable_length_inputs(
+                EnrichmentModelTypeEnum.jina_v1.value
+            )
+        )
+
+    def test_jina_v2_is_variable(self):
+        from frigate.detectors.detection_runners import ONNXModelRunner
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        self.assertTrue(
+            ONNXModelRunner.has_variable_length_inputs(
+                EnrichmentModelTypeEnum.jina_v2.value
+            )
+        )
+
+    def test_paddleocr_is_variable(self):
+        from frigate.detectors.detection_runners import ONNXModelRunner
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        self.assertTrue(
+            ONNXModelRunner.has_variable_length_inputs(
+                EnrichmentModelTypeEnum.paddleocr.value
+            )
+        )
+
+    def test_yolo_generic_is_fixed(self):
+        from frigate.detectors.detection_runners import ONNXModelRunner
+        from frigate.detectors.detector_config import ModelTypeEnum
+
+        self.assertFalse(
+            ONNXModelRunner.has_variable_length_inputs(ModelTypeEnum.yologeneric.value)
+        )
+
+    def test_none_is_fixed(self):
+        from frigate.detectors.detection_runners import ONNXModelRunner
+
+        self.assertFalse(ONNXModelRunner.has_variable_length_inputs(None))
+
+
+class TestComputeCudaMemLimit(unittest.TestCase):
+    @patch("frigate.util.model.ctypes.CDLL")
+    @patch("os.path.getsize", return_value=200 * 1024 * 1024)  # 200 MB model
+    def test_respects_ceiling(self, mock_getsize, mock_cdll):
+        """gpu_mem_limit must not exceed 80% of total VRAM."""
+        from frigate.util.model import compute_cuda_mem_limit
+
+        total_vram = 24 * 1024**3  # 24 GB
+        mock_lib = MagicMock()
+        mock_cdll.return_value = mock_lib
+
+        def fake_mem_get_info(free_ptr, total_ptr):
+            total_ptr._obj.value = total_vram
+            free_ptr._obj.value = total_vram
+            return 0  # cudaSuccess
+
+        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
+
+        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
+        self.assertLessEqual(limit, int(total_vram * 0.80))
+
+    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
+    def test_fallback_on_cuda_unavailable(self, _mock_cdll):
+        """Falls back to 4 GB when the CUDA runtime is not available."""
+        from frigate.util.model import compute_cuda_mem_limit
+
+        limit = compute_cuda_mem_limit("/fake/model.onnx")
+        self.assertEqual(limit, 4 * 1024**3)
+
+    @patch("frigate.util.model.ctypes.CDLL")
+    @patch("os.path.getsize", return_value=50 * 1024 * 1024)  # 50 MB model
+    def test_floor_is_at_least_2gb(self, mock_getsize, mock_cdll):
+        """Floor must be at least 2 GB regardless of model size."""
+        from frigate.util.model import compute_cuda_mem_limit
+
+        total_vram = 24 * 1024**3
+        mock_lib = MagicMock()
+        mock_cdll.return_value = mock_lib
+
+        def fake_mem_get_info(free_ptr, total_ptr):
+            total_ptr._obj.value = total_vram
+            free_ptr._obj.value = total_vram
+            return 0  # cudaSuccess
+
+        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
+
+        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
+        self.assertGreaterEqual(limit, 2 * 1024**3)
+
+
+class TestOrtLeakFixRegression(unittest.TestCase):
+    """Regression guards for the embeddings_manager ORT memory leak fix.
+
+    These tests verify that the three leak vectors identified in GitHub Discussion
+    #23007 remain fixed:
+
+    1. ORT CPU BFC arena (enable_cpu_mem_arena) — must be False for all sessions
+       so host-side GPU↔CPU staging buffers are not pooled indefinitely.
+
+    2. ORT memory-pattern cache (enable_mem_pattern) — must be False for
+       variable-length embedding models (Jina, PaddleOCR) to prevent one
+       mmap-backed plan per unique sequence length from accumulating forever.
+       Must remain True for fixed-size models (YOLO) to preserve buffer aliasing.
+
+    3. mallopt(M_ARENA_MAX) — must be called from inside EmbeddingProcess.run()
+       because forkserver spawn does not inherit Docker env vars, so setting
+       MALLOC_ARENA_MAX in docker-compose has no effect on the child process.
+    """
+
+    def test_get_optimized_runner_passes_variable_length_for_jina(self):
+        """get_optimized_runner must enable variable_length_inputs for Jina models."""
+        from frigate.detectors.detection_runners import get_ort_session_options
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        with patch(
+            "frigate.detectors.detection_runners.get_ort_session_options",
+            wraps=get_ort_session_options,
+        ) as mock_opts, patch(
+            "frigate.detectors.detection_runners.ort.InferenceSession"
+        ), patch(
+            "frigate.detectors.detection_runners.get_ort_providers",
+            return_value=(["CPUExecutionProvider"], [{}]),
+        ), patch(
+            "frigate.detectors.detection_runners.is_rknn_compatible",
+            return_value=False,
+        ), patch(
+            "os.path.getsize", return_value=100 * 1024 * 1024
+        ):
+            from frigate.detectors.detection_runners import get_optimized_runner
+
+            get_optimized_runner(
+                "/fake/jina.onnx",
+                device="CPU",
+                model_type=EnrichmentModelTypeEnum.jina_v2.value,
+            )
+
+            calls = mock_opts.call_args_list
+            self.assertTrue(
+                any(c.kwargs.get("variable_length_inputs") for c in calls),
+                "get_ort_session_options must be called with variable_length_inputs=True "
+                "for Jina models to prevent mmap plan cache growth",
+            )
+
+    def test_get_optimized_runner_does_not_set_variable_length_for_yolo(self):
+        """get_optimized_runner must NOT set variable_length_inputs for YOLO.
+
+        Disabling enable_mem_pattern on YOLO (fixed 640×640 input) prevents ORT
+        from aliasing buffers between nodes, pushing peak GPU memory from ~1.8 GB
+        to >4 GB and crashing CUDA graph capture.
+        """
+        from frigate.detectors.detection_runners import get_ort_session_options
+        from frigate.detectors.detector_config import ModelTypeEnum
+
+        with patch(
+            "frigate.detectors.detection_runners.get_ort_session_options",
+            wraps=get_ort_session_options,
+        ) as mock_opts, patch(
+            "frigate.detectors.detection_runners.ort.InferenceSession"
+        ) as mock_session, patch(
+            "frigate.detectors.detection_runners.get_ort_providers",
+            return_value=(["CPUExecutionProvider"], [{}]),
+        ), patch(
+            "frigate.detectors.detection_runners.is_rknn_compatible",
+            return_value=False,
+        ), patch(
+            "os.path.getsize", return_value=220 * 1024 * 1024
+        ):
+            mock_session.return_value.get_inputs.return_value = []
+            mock_session.return_value.get_outputs.return_value = []
+            from frigate.detectors.detection_runners import get_optimized_runner
+
+            get_optimized_runner(
+                "/fake/yolov9.onnx",
+                device="CPU",
+                model_type=ModelTypeEnum.yologeneric.value,
+            )
+
+            for call in mock_opts.call_args_list:
+                self.assertFalse(
+                    call.kwargs.get("variable_length_inputs", False),
+                    "variable_length_inputs must not be True for YOLO — disabling "
+                    "enable_mem_pattern on fixed-size models causes CUDA graph crashes",
+                )
+
+    def test_all_sessions_disable_cpu_mem_arena(self):
+        """enable_cpu_mem_arena must be False regardless of model type.
+
+        With the arena enabled, ORT pools CPU-side staging buffers for GPU↔CPU
+        transfers indefinitely, causing RSS growth of hundreds of MB per hour.
+        """
+        from frigate.detectors.detection_runners import get_ort_session_options
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        for model_type in [
+            None,
+            EnrichmentModelTypeEnum.jina_v1.value,
+            EnrichmentModelTypeEnum.jina_v2.value,
+            EnrichmentModelTypeEnum.paddleocr.value,
+        ]:
+            with self.subTest(model_type=model_type):
+                from frigate.detectors.detection_runners import ONNXModelRunner
+
+                opts = get_ort_session_options(
+                    variable_length_inputs=ONNXModelRunner.has_variable_length_inputs(
+                        model_type
+                    )
+                )
+                self.assertFalse(
+                    opts.enable_cpu_mem_arena,
+                    f"enable_cpu_mem_arena must be False for model_type={model_type}",
+                )
+
+    def test_embedding_process_calls_mallopt(self):
+        """EmbeddingProcess.run() must call mallopt(M_ARENA_MAX) to cap glibc arenas.
+
+        Forkserver spawn exec's a fresh Python interpreter that does not inherit
+        Docker env vars. MALLOC_ARENA_MAX set in docker-compose never reaches
+        the child process, so mallopt() must be called explicitly from run().
+        """
+        import frigate.embeddings as emb_module
+
+        # Make EmbeddingMaintainer raise immediately so run() exits after mallopt.
+        with patch.object(
+            emb_module, "EmbeddingMaintainer", side_effect=RuntimeError("stop")
+        ), patch.object(
+            emb_module.EmbeddingProcess, "pre_run_setup"
+        ), patch(
+            "ctypes.CDLL"
+        ) as mock_cdll:
+            mock_libc = MagicMock()
+            mock_cdll.return_value = mock_libc
+
+            process = emb_module.EmbeddingProcess.__new__(
+                emb_module.EmbeddingProcess
+            )
+            process.config = MagicMock()
+            process.metrics = MagicMock()
+            process.stop_event = MagicMock(is_set=MagicMock(return_value=True))
+
+            try:
+                process.run()
+            except RuntimeError:
+                pass
+
+            mock_cdll.assert_called_with("libc.so.6")
+            mock_libc.mallopt.assert_called_once()
+            args = mock_libc.mallopt.call_args[0]
+            self.assertEqual(
+                args[0],
+                -8,  # M_ARENA_MAX
+                "mallopt must be called with M_ARENA_MAX (-8)",
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/frigate/util/model.py b/frigate/util/model.py
index 338303e2d..d0b8721cc 100644
--- a/frigate/util/model.py
+++ b/frigate/util/model.py
@@ -1,5 +1,6 @@
 """Model Utils"""
 
+import ctypes
 import logging
 import os
 from typing import Any
@@ -283,6 +284,39 @@ def post_process_yolox(
 
 ### ONNX Utilities
 
+def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:
+    """Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena.
+
+    For CudaGraphRunner (YOLO detection) do NOT call this — CUDA graph capture
+    requires all intermediate tensors to be live simultaneously, so peak GPU memory
+    is 15-20× the model file size and cannot be safely capped. This function is
+    intended for embedding ONNXModelRunner sessions only; the cuda_graph flag
+    doubles the multiplier for completeness, but every current call site
+    passes cuda_graph=False.
+
+    Returns a limit derived from:
+    - Floor: model file size × peak_multiplier (≥ 2 GB)
+    - Ceiling: 80% of total GPU VRAM
+    Falls back to 4 GB if the CUDA runtime query fails.
+    """
+    try:
+        libcudart = ctypes.CDLL("libcudart.so")
+        free_bytes = ctypes.c_size_t()
+        total_bytes = ctypes.c_size_t()
+        rc = libcudart.cudaMemGetInfo(ctypes.byref(free_bytes), ctypes.byref(total_bytes))
+        if rc != 0 or total_bytes.value == 0:
+            raise OSError(f"cudaMemGetInfo failed (rc={rc})")
+        total = total_bytes.value
+    except Exception:
+        logger.debug("cudaMemGetInfo unavailable; using 4 GB gpu_mem_limit fallback")
+        return 4 * 1024**3
+
+    peak_multiplier = 14 if cuda_graph else 7
+    floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
+    ceiling = int(total * 0.80)
+    return min(floor, ceiling)
+
+
 def get_ort_providers(
     force_cpu: bool = False,
     device: str | None = "AUTO",
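Reviewer sketch (illustrative, not part of the applied diff): how the computed
cap reaches the CUDA EP's BFC arena. device_id and gpu_mem_limit are real
CUDAExecutionProvider options; the model path below is a made-up example.

    import onnxruntime as ort

    from frigate.detectors.detection_runners import get_ort_session_options
    from frigate.util.model import compute_cuda_mem_limit

    model_path = "/config/model_cache/example-embedding.onnx"  # hypothetical path
    session = ort.InferenceSession(
        model_path,
        sess_options=get_ort_session_options(variable_length_inputs=True),
        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        provider_options=[
            # min(max(7 x model size, 2 GB), 80% of VRAM); 4 GB if the query fails
            {"device_id": 0, "gpu_mem_limit": compute_cuda_mem_limit(model_path)},
            {},
        ],
    )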