diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py
index 8d7eb1e67..36e8971ef 100644
--- a/frigate/detectors/detection_runners.py
+++ b/frigate/detectors/detection_runners.py
@@ -10,7 +10,7 @@ from typing import Any
 import numpy as np
 import onnxruntime as ort
 
-from frigate.util.model import get_ort_providers
+from frigate.util.model import compute_cuda_mem_limit, get_ort_providers
 from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible
 
 logger = logging.getLogger(__name__)
@@ -24,23 +24,45 @@ def is_arm64_platform() -> bool:
 
 def get_ort_session_options(
     is_complex_model: bool = False,
-) -> ort.SessionOptions | None:
+    variable_length_inputs: bool = False,
+) -> ort.SessionOptions:
     """Get ONNX Runtime session options with appropriate settings.
 
     Args:
         is_complex_model: Whether the model needs basic optimization to avoid
             graph fusion issues.
+        variable_length_inputs: Whether the model receives variable-length inputs (e.g. text
+            embeddings). When True, disables memory-pattern caching, which otherwise builds
+            a plan per unique input shape and holds onto mmap regions indefinitely - a major
+            source of RSS growth in the embeddings_manager process.
 
     Returns:
-        SessionOptions with appropriate optimization level, or None for default settings.
+        SessionOptions with appropriate settings.
     """
+    sess_options = ort.SessionOptions()
+
+    # Disable the CPU BFC arena for all sessions. With the arena enabled, ORT pools
+    # host-side staging buffers for GPU -> CPU transfers and never releases them back to
+    # the OS, causing RSS to grow without bound in long-running embedding processes.
+    sess_options.enable_cpu_mem_arena = False
+
+    if variable_length_inputs:
+        # Disable per-shape memory-layout plan caching for models with variable-length
+        # inputs (Jina CLIP text, PaddleOCR). Each unique sequence length creates a
+        # new mmap-backed plan that is never freed, leading to unbounded anon-mmap growth.
+        sess_options.enable_mem_pattern = False
+    else:
+        # Fixed-size models (like YOLO) keep mem_pattern on for buffer aliasing.
+        # Set explicitly to be robust against ORT default changes.
+        sess_options.enable_mem_pattern = True
 
     if is_complex_model:
-        sess_options = ort.SessionOptions()
         sess_options.graph_optimization_level = (
             ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
         )
-        return sess_options
-
-    return None
+    else:
+        # Most models tolerate aggressive fusions; set explicitly to be robust
+        # against ORT default changes.
+        sess_options.graph_optimization_level = (
+            ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+        )
+
+    return sess_options
 
 
 # Import OpenVINO only when needed to avoid circular dependencies
@@ -136,6 +158,25 @@ class ONNXModelRunner(BaseModelRunner):
             ModelTypeEnum.dfine.value,
         ]
 
+    @staticmethod
+    def has_variable_length_inputs(model_type: str | None) -> bool:
+        """Return True for models whose input length varies between inferences.
+
+        ORT builds a memory-layout plan per unique input shape and caches it
+        indefinitely (enable_mem_pattern). For fixed-size models (YOLO) this
+        is a single plan; for variable-length text embeddings it grows without
+        bound and must be disabled.
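+
+        Illustrative contrast (the example sequence lengths are made up):
+        Jina text may see shapes (1, 12), (1, 77), (1, 131), ... - one
+        cached plan each - while YOLO always sees one fixed shape such as
+        (1, 3, 640, 640), so its single plan is cheap to keep.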
+ """ + if not model_type: + return False + from frigate.embeddings.types import EnrichmentModelTypeEnum + + return model_type in [ + EnrichmentModelTypeEnum.jina_v1.value, + EnrichmentModelTypeEnum.jina_v2.value, + EnrichmentModelTypeEnum.paddleocr.value, + ] + @staticmethod def is_concurrent_model(model_type: str | None) -> bool: """Check if model requires thread locking for concurrent inference. @@ -581,18 +622,27 @@ def get_optimized_runner( CudaGraphRunner.is_model_supported(model_type) and providers[0] == "CUDAExecutionProvider" ): - options[0] = { - **options[0], - "enable_cuda_graph": True, - } - return CudaGraphRunner( - ort.InferenceSession( + try: + cuda_graph_options = {**options[0], "enable_cuda_graph": True} + return CudaGraphRunner( + ort.InferenceSession( + model_path, + sess_options=get_ort_session_options(), + providers=providers, + provider_options=[cuda_graph_options, *options[1:]], + ), + cuda_graph_options["device_id"], + ) + except Exception as e: + logger.warning( + "CUDA graph capture failed for model_type=%s path=%s " + "device_id=%s providers=%s; falling back to standard ONNX runner: %s", + model_type, model_path, - providers=providers, - provider_options=options, - ), - options[0]["device_id"], - ) + cuda_graph_options.get("device_id"), + providers, + e, + ) if ( providers @@ -603,11 +653,19 @@ def get_optimized_runner( providers.pop(0) options.pop(0) + if providers and providers[0] == "CUDAExecutionProvider": + gpu_mem_limit = compute_cuda_mem_limit(model_path, cuda_graph=False) + if gpu_mem_limit is not None: + options[0] = {**options[0], "gpu_mem_limit": gpu_mem_limit} + return ONNXModelRunner( ort.InferenceSession( model_path, sess_options=get_ort_session_options( - ONNXModelRunner.is_cpu_complex_model(model_type) + is_complex_model=ONNXModelRunner.is_cpu_complex_model(model_type), + variable_length_inputs=ONNXModelRunner.has_variable_length_inputs( + model_type + ), ), providers=providers, provider_options=options, diff --git a/frigate/embeddings/__init__.py b/frigate/embeddings/__init__.py index 7e54d9703..4f964343c 100644 --- a/frigate/embeddings/__init__.py +++ b/frigate/embeddings/__init__.py @@ -1,6 +1,7 @@ """SQLite-vec embeddings database.""" import base64 +import ctypes import json import logging import os @@ -46,6 +47,19 @@ class EmbeddingProcess(FrigateProcess): self.metrics = metrics def run(self) -> None: + # glibc reads MALLOC_ARENA_MAX only once, at malloc init - before this + # Python interpreter is even up. Setting it via docker-compose is + # brittle: it has to survive the s6-overlay service-supervision chain + # (which can filter env via s6-setuidgid/s6-envuidgid) and arrive + # before the very first malloc call. Calling mallopt(M_ARENA_MAX, n_cpu) + # here is the runtime equivalent and works regardless of how we were + # spawned, capping arenas at N_CPU instead of the default 8×N_CPU and + # preventing heap fragmentation under the embeddings workload. 
+ try: + ctypes.CDLL("libc.so.6").mallopt(-8, os.cpu_count()) # M_ARENA_MAX + except Exception: + pass + self.pre_run_setup(self.config.logger) maintainer = EmbeddingMaintainer( self.config, diff --git a/frigate/embeddings/types.py b/frigate/embeddings/types.py index 32cbe5dd0..c66a6e2ad 100644 --- a/frigate/embeddings/types.py +++ b/frigate/embeddings/types.py @@ -7,6 +7,17 @@ class EmbeddingTypeEnum(str, Enum): class EnrichmentModelTypeEnum(str, Enum): + # When adding a value, audit every classifier that switches on it: + # - ONNXModelRunner.has_variable_length_inputs + # - ONNXModelRunner.is_cpu_complex_model + # - ONNXModelRunner.is_migraphx_complex_model + # - ONNXModelRunner.is_concurrent_model + # - CudaGraphRunner.is_model_supported + # The default for omission is "fixed-size, simple, single-threaded" - which + # silently re-introduces the ORT mem-pattern leak if the new model is + # actually variable-length (Jina/PaddleOCR-class). + # TODO: replace these scattered include-lists with a single MODEL_TRAITS + # registry co-located with the enum so adding a value forces classification. arcface = "arcface" facenet = "facenet" jina_v1 = "jina_v1" diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py new file mode 100644 index 000000000..85d90ed76 --- /dev/null +++ b/frigate/test/test_detection_runners.py @@ -0,0 +1,619 @@ +"""Tests for detection_runners session options and memory management helpers.""" + +import unittest +from unittest.mock import MagicMock, patch + + +class TestGetOrtSessionOptions(unittest.TestCase): + def setUp(self): + import onnxruntime as ort + + self.ort = ort + + def test_default_disables_cpu_mem_arena(self): + from frigate.detectors.detection_runners import get_ort_session_options + + opts = get_ort_session_options() + self.assertFalse(opts.enable_cpu_mem_arena) + + def test_default_keeps_mem_pattern_enabled(self): + from frigate.detectors.detection_runners import get_ort_session_options + + opts = get_ort_session_options() + self.assertTrue(opts.enable_mem_pattern) + + def test_variable_length_inputs_disables_mem_pattern(self): + from frigate.detectors.detection_runners import get_ort_session_options + + opts = get_ort_session_options(variable_length_inputs=True) + self.assertFalse(opts.enable_mem_pattern) + self.assertFalse(opts.enable_cpu_mem_arena) + + def test_complex_model_sets_basic_optimization(self): + from frigate.detectors.detection_runners import get_ort_session_options + + import onnxruntime as ort + + opts = get_ort_session_options(is_complex_model=True) + self.assertEqual( + opts.graph_optimization_level, + ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, + ) + + def test_default_sets_enable_all_optimization(self): + # Guards the explicit `else` branch added so the optimization level is + # never implicit — protects against ORT default changes. 
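+        # (Current ORT releases already default to ORT_ENABLE_ALL; the assert
+        # pins that contract instead of trusting the default to hold.)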
+ from frigate.detectors.detection_runners import get_ort_session_options + + import onnxruntime as ort + + opts = get_ort_session_options() + self.assertEqual( + opts.graph_optimization_level, + ort.GraphOptimizationLevel.ORT_ENABLE_ALL, + ) + + def test_always_returns_session_options(self): + from frigate.detectors.detection_runners import get_ort_session_options + + import onnxruntime as ort + + self.assertIsInstance(get_ort_session_options(), ort.SessionOptions) + self.assertIsInstance( + get_ort_session_options(is_complex_model=True), ort.SessionOptions + ) + self.assertIsInstance( + get_ort_session_options(variable_length_inputs=True), ort.SessionOptions + ) + + +class TestHasVariableLengthInputs(unittest.TestCase): + def test_jina_v1_is_variable(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + self.assertTrue( + ONNXModelRunner.has_variable_length_inputs( + EnrichmentModelTypeEnum.jina_v1.value + ) + ) + + def test_jina_v2_is_variable(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + self.assertTrue( + ONNXModelRunner.has_variable_length_inputs( + EnrichmentModelTypeEnum.jina_v2.value + ) + ) + + def test_paddleocr_is_variable(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + self.assertTrue( + ONNXModelRunner.has_variable_length_inputs( + EnrichmentModelTypeEnum.paddleocr.value + ) + ) + + def test_yolo_generic_is_fixed(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.detectors.detector_config import ModelTypeEnum + + self.assertFalse( + ONNXModelRunner.has_variable_length_inputs(ModelTypeEnum.yologeneric.value) + ) + + def test_none_is_fixed(self): + from frigate.detectors.detection_runners import ONNXModelRunner + + self.assertFalse(ONNXModelRunner.has_variable_length_inputs(None)) + + def test_arcface_is_fixed(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + self.assertFalse( + ONNXModelRunner.has_variable_length_inputs( + EnrichmentModelTypeEnum.arcface.value + ) + ) + + def test_facenet_is_fixed(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + self.assertFalse( + ONNXModelRunner.has_variable_length_inputs( + EnrichmentModelTypeEnum.facenet.value + ) + ) + + def test_yolov9_license_plate_is_fixed(self): + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + self.assertFalse( + ONNXModelRunner.has_variable_length_inputs( + EnrichmentModelTypeEnum.yolov9_license_plate.value + ) + ) + + def test_every_enrichment_model_is_explicitly_classified(self): + """Every EnrichmentModelTypeEnum value must be deliberately classified. + + Adding a new model to the enum without updating has_variable_length_inputs + silently defaults it to fixed-size (mem_pattern stays on), which + re-introduces the ORT mmap-plan leak if the new model is actually + variable-length. This test fails on any unclassified enum value so the + author is forced to make a deliberate decision. 
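+
+        To classify a new model: add it to exactly one of VARIABLE_LENGTH or
+        FIXED_LENGTH below and, if variable, extend
+        ONNXModelRunner.has_variable_length_inputs to match.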
+ + TODO: replace this guard with a single MODEL_TRAITS registry co-located + with EnrichmentModelTypeEnum so adding a value mechanically forces + classification across every classifier (variable-length, cpu_complex, + migraphx_complex, concurrent, cuda_graph_supported), not just this one. + """ + from frigate.detectors.detection_runners import ONNXModelRunner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + VARIABLE_LENGTH = { + EnrichmentModelTypeEnum.jina_v1, + EnrichmentModelTypeEnum.jina_v2, + EnrichmentModelTypeEnum.paddleocr, + } + FIXED_LENGTH = { + EnrichmentModelTypeEnum.arcface, + EnrichmentModelTypeEnum.facenet, + EnrichmentModelTypeEnum.yolov9_license_plate, + } + classified = VARIABLE_LENGTH | FIXED_LENGTH + for member in EnrichmentModelTypeEnum: + self.assertIn( + member, + classified, + f"{member.value} is not explicitly classified — audit " + "ONNXModelRunner.has_variable_length_inputs (and the other " + "classifiers listed in EnrichmentModelTypeEnum's docstring).", + ) + self.assertEqual( + ONNXModelRunner.has_variable_length_inputs(member.value), + member in VARIABLE_LENGTH, + f"{member.value}: classification disagrees with " + "has_variable_length_inputs — update one or the other.", + ) + + +class TestComputeCudaMemLimit(unittest.TestCase): + @staticmethod + def _fake_mem_get_info(free_value: int, total_value: int): + def _impl(free_ptr, total_ptr): + free_ptr._obj.value = free_value + total_ptr._obj.value = total_value + return 0 # cudaSuccess + + return _impl + + @patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_respects_ceiling(self, _mock_getsize, mock_cdll): + from frigate.util.model import compute_cuda_mem_limit + + total_vram = 24 * 1024**3 + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + total_vram, total_vram + ) + + limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) + self.assertLessEqual(limit, int(total_vram * 0.80)) + + @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) + def test_returns_none_when_cuda_unavailable(self, _mock_cdll): + # See compute_cuda_mem_limit docstring for the tradeoff: returning a + # hardcoded fallback was wrong for low-VRAM devices (Jetson Nano, K620). + from frigate.util.model import compute_cuda_mem_limit + + self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx")) + + @patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=50 * 1024 * 1024) + def test_floor_is_at_least_2gb(self, _mock_getsize, mock_cdll): + from frigate.util.model import compute_cuda_mem_limit + + total_vram = 24 * 1024**3 + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + total_vram, total_vram + ) + + limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) + self.assertGreaterEqual(limit, 2 * 1024**3) + + @patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_returns_none_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll): + # Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0, + # producing gpu_mem_limit=0 and immediate session OOM. We now return + # None so the caller omits gpu_mem_limit and ORT manages the arena. 
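+        # The call being faked is the CUDA runtime API:
+        #     cudaError_t cudaMemGetInfo(size_t* free, size_t* total);
+        # On a non-zero return code the out-params were never written, so
+        # their zeroed values must not feed any limit math.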
+ from frigate.util.model import compute_cuda_mem_limit + + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + mock_lib.cudaMemGetInfo.return_value = 2 # cudaErrorMemoryAllocation + + self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)) + + @patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=500 * 1024 * 1024) + def test_cuda_graph_doubles_peak_multiplier(self, _mock_getsize, mock_cdll): + # cuda_graph=True must use peak_multiplier=14 (vs 7 for cuda_graph=False) + # because graph capture pins all intermediate tensors live simultaneously. + from frigate.util.model import compute_cuda_mem_limit + + total_vram = 24 * 1024**3 + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + total_vram, total_vram + ) + + model_size = 500 * 1024 * 1024 + with_graph = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=True) + without_graph = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) + self.assertGreaterEqual(with_graph, model_size * 14) + self.assertGreaterEqual(without_graph, model_size * 7) + self.assertGreater(with_graph, without_graph) + + @patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_capped_by_free_vram_when_constrained(self, _mock_getsize, mock_cdll): + # Bug #2: with 3 GB free of 24 GB, the limit must respect free × 0.9, + # not 80% of total — co-resident embedding sessions would OOM otherwise. + from frigate.util.model import compute_cuda_mem_limit + + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + 3 * 1024**3, 24 * 1024**3 + ) + + limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False) + self.assertLessEqual(limit, int(3 * 1024**3 * 0.90)) + + +class TestOrtLeakFixRegression(unittest.TestCase): + """Regression guards for the embeddings_manager ORT memory leak fix. + + These tests verify that the three leak vectors identified in GitHub Discussion + #23007 remain fixed: + + 1. ORT CPU BFC arena (enable_cpu_mem_arena) — must be False for all sessions + so host-side GPU↔CPU staging buffers are not pooled indefinitely. + + 2. ORT memory-pattern cache (enable_mem_pattern) — must be False for + variable-length embedding models (Jina, PaddleOCR) to prevent one + mmap-backed plan per unique sequence length from accumulating forever. + Must remain True for fixed-size models (YOLO) to preserve buffer aliasing. + + 3. mallopt(M_ARENA_MAX) — must be called from inside EmbeddingProcess.run() + because glibc reads MALLOC_ARENA_MAX once at malloc init, and the env + var is brittle to deliver through s6-overlay supervision before that + point. In-process mallopt is the runtime-safe equivalent. 
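+
+    Mapping to the tests below: vector 1 ->
+    test_all_sessions_disable_cpu_mem_arena, vector 2 -> the two
+    variable_length_inputs tests, vector 3 ->
+    test_embedding_process_calls_mallopt.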
+ """ + + def test_get_optimized_runner_passes_variable_length_for_jina(self): + """get_optimized_runner must enable variable_length_inputs for Jina models.""" + from frigate.detectors.detection_runners import get_ort_session_options + from frigate.embeddings.types import EnrichmentModelTypeEnum + + with patch( + "frigate.detectors.detection_runners.get_ort_session_options", + wraps=get_ort_session_options, + ) as mock_opts, patch( + "frigate.detectors.detection_runners.ort.InferenceSession" + ), patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CPUExecutionProvider"], [{}]), + ), patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ), patch( + "os.path.getsize", return_value=100 * 1024 * 1024 + ): + from frigate.detectors.detection_runners import get_optimized_runner + + get_optimized_runner( + "/fake/jina.onnx", + device="CPU", + model_type=EnrichmentModelTypeEnum.jina_v2.value, + ) + + calls = mock_opts.call_args_list + self.assertTrue( + any(c.kwargs.get("variable_length_inputs") for c in calls), + "get_ort_session_options must be called with variable_length_inputs=True " + "for Jina models to prevent mmap plan cache growth", + ) + + def test_get_optimized_runner_does_not_set_variable_length_for_yolo(self): + """get_optimized_runner must NOT set variable_length_inputs for YOLO. + + Disabling enable_mem_pattern on YOLO (fixed 640×640 input) prevents ORT + from aliasing buffers between nodes, pushing peak GPU memory from ~1.8 GB + to >4 GB and crashing CUDA graph capture. + """ + from frigate.detectors.detection_runners import get_ort_session_options + from frigate.detectors.detector_config import ModelTypeEnum + + with patch( + "frigate.detectors.detection_runners.get_ort_session_options", + wraps=get_ort_session_options, + ) as mock_opts, patch( + "frigate.detectors.detection_runners.ort.InferenceSession" + ) as mock_session, patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CPUExecutionProvider"], [{}]), + ), patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ), patch( + "os.path.getsize", return_value=220 * 1024 * 1024 + ): + mock_session.return_value.get_inputs.return_value = [] + mock_session.return_value.get_outputs.return_value = [] + from frigate.detectors.detection_runners import get_optimized_runner + + get_optimized_runner( + "/fake/yolov9.onnx", + device="CPU", + model_type=ModelTypeEnum.yologeneric.value, + ) + + for call in mock_opts.call_args_list: + self.assertFalse( + call.kwargs.get("variable_length_inputs", False), + "variable_length_inputs must not be True for YOLO — disabling " + "enable_mem_pattern on fixed-size models causes CUDA graph crashes", + ) + + def test_all_sessions_disable_cpu_mem_arena(self): + """enable_cpu_mem_arena must be False regardless of model type. + + With the arena enabled, ORT pools CPU-side staging buffers for GPU↔CPU + transfers indefinitely, causing RSS growth of hundreds of MB per hour. 
+ """ + from frigate.detectors.detection_runners import get_ort_session_options + from frigate.embeddings.types import EnrichmentModelTypeEnum + + for model_type in [ + None, + EnrichmentModelTypeEnum.jina_v1.value, + EnrichmentModelTypeEnum.jina_v2.value, + EnrichmentModelTypeEnum.paddleocr.value, + ]: + with self.subTest(model_type=model_type): + from frigate.detectors.detection_runners import ONNXModelRunner + + opts = get_ort_session_options( + variable_length_inputs=ONNXModelRunner.has_variable_length_inputs( + model_type + ) + ) + self.assertFalse( + opts.enable_cpu_mem_arena, + f"enable_cpu_mem_arena must be False for model_type={model_type}", + ) + + def test_embedding_process_calls_mallopt(self): + """EmbeddingProcess.run() must call mallopt(M_ARENA_MAX) to cap glibc arenas. + + glibc reads MALLOC_ARENA_MAX only at malloc init, before this Python + interpreter is up, and the env var is brittle to deliver through the + s6-overlay service-supervision chain before that point. mallopt() + is the runtime-safe equivalent and must be called explicitly from run(). + """ + import frigate.embeddings as emb_module + + # Make EmbeddingMaintainer raise immediately so run() exits after mallopt. + with patch.object( + emb_module, "EmbeddingMaintainer", side_effect=RuntimeError("stop") + ), patch.object( + emb_module.EmbeddingProcess, "pre_run_setup" + ), patch( + "ctypes.CDLL" + ) as mock_cdll: + mock_libc = MagicMock() + mock_cdll.return_value = mock_libc + + process = emb_module.EmbeddingProcess.__new__( + emb_module.EmbeddingProcess + ) + process.config = MagicMock() + process.metrics = MagicMock() + process.stop_event = MagicMock(is_set=MagicMock(return_value=True)) + + try: + process.run() + except RuntimeError: + pass + + mock_cdll.assert_called_with("libc.so.6") + mock_libc.mallopt.assert_called_once() + args = mock_libc.mallopt.call_args[0] + self.assertEqual( + args[0], + -8, # M_ARENA_MAX + "mallopt must be called with M_ARENA_MAX (-8)", + ) + + +class TestRunnerOmitsGpuMemLimitOnCudaQueryFailure(unittest.TestCase): + """When compute_cuda_mem_limit returns None, get_optimized_runner must NOT + inject gpu_mem_limit at all, leaving ORT's grow-as-needed default in place.""" + + @patch("frigate.detectors.detection_runners.ort.InferenceSession") + @patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]), + ) + @patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ) + @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_no_gpu_mem_limit_key_when_cuda_query_fails( + self, _gs, _cdll, _rknn, _gp, mock_session + ): + from frigate.detectors.detection_runners import get_optimized_runner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + mock_session.return_value.get_inputs.return_value = [] + mock_session.return_value.get_outputs.return_value = [] + + get_optimized_runner( + "/fake/jina.onnx", + device="GPU", + model_type=EnrichmentModelTypeEnum.jina_v2.value, + ) + + provider_opts = mock_session.call_args.kwargs["provider_options"] + self.assertNotIn( + "gpu_mem_limit", + provider_opts[0], + "Must omit (not set to 0, not set to a guess) when query fails", + ) + + +class TestRunnerInjectsGpuMemLimitOnCudaQuerySuccess(unittest.TestCase): + """Positive counterpart to TestRunnerOmitsGpuMemLimitOnCudaQueryFailure: + when cudaMemGetInfo succeeds, gpu_mem_limit must be injected into + 
provider_options so ORT's BFC arena is bounded.""" + + @staticmethod + def _fake_mem_get_info(free_value: int, total_value: int): + def _impl(free_ptr, total_ptr): + free_ptr._obj.value = free_value + total_ptr._obj.value = total_value + return 0 # cudaSuccess + + return _impl + + @patch("frigate.detectors.detection_runners.ort.InferenceSession") + @patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]), + ) + @patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ) + @patch("frigate.util.model.ctypes.CDLL") + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_gpu_mem_limit_key_present_when_cuda_query_succeeds( + self, _gs, mock_cdll, _rknn, _gp, mock_session + ): + from frigate.detectors.detection_runners import get_optimized_runner + from frigate.embeddings.types import EnrichmentModelTypeEnum + + total_vram = 24 * 1024**3 + mock_lib = MagicMock() + mock_cdll.return_value = mock_lib + mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info( + total_vram, total_vram + ) + mock_session.return_value.get_inputs.return_value = [] + mock_session.return_value.get_outputs.return_value = [] + + get_optimized_runner( + "/fake/jina.onnx", + device="GPU", + model_type=EnrichmentModelTypeEnum.jina_v2.value, + ) + + provider_opts = mock_session.call_args.kwargs["provider_options"] + self.assertIn("gpu_mem_limit", provider_opts[0]) + self.assertGreater(provider_opts[0]["gpu_mem_limit"], 0) + + +class TestCudaGraphFallbackLogsException(unittest.TestCase): + @patch("frigate.detectors.detection_runners.ort.InferenceSession") + @patch( + "frigate.detectors.detection_runners.get_ort_providers", + return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]), + ) + @patch( + "frigate.detectors.detection_runners.is_rknn_compatible", + return_value=False, + ) + @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda")) + @patch("os.path.getsize", return_value=200 * 1024 * 1024) + def test_fallback_warning_includes_exception_text( + self, _gs, _cdll, _rknn, _gp, mock_session + ): + # Concern #1: the bare `except Exception:` swallowed the underlying + # ORT error (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.), + # turning a debuggable failure into an opaque "fell back to ONNX runner". 
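+        # The two side effects below model the real sequence: the session
+        # built with enable_cuda_graph raises during construction, then the
+        # plain fallback session construction succeeds.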
+        from frigate.detectors.detection_runners import get_optimized_runner
+        from frigate.detectors.detector_config import ModelTypeEnum
+
+        mock_session.side_effect = [
+            RuntimeError("cudaErrorStreamCaptureUnsupported"),
+            MagicMock(get_inputs=lambda: [], get_outputs=lambda: []),
+        ]
+
+        with self.assertLogs(
+            "frigate.detectors.detection_runners", level="WARNING"
+        ) as captured:
+            get_optimized_runner(
+                "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value
+            )
+
+        joined = "\n".join(captured.output)
+        self.assertIn("CUDA graph capture failed", joined)
+        self.assertIn("cudaErrorStreamCaptureUnsupported", joined)
+
+    @patch("frigate.detectors.detection_runners.ort.InferenceSession")
+    @patch(
+        "frigate.detectors.detection_runners.get_ort_providers",
+        return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
+    )
+    @patch(
+        "frigate.detectors.detection_runners.is_rknn_compatible",
+        return_value=False,
+    )
+    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
+    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
+    def test_fallback_warning_includes_developer_context(
+        self, _gs, _cdll, _rknn, _gp, mock_session
+    ):
+        # Guards the enriched warning fields (model_type, device_id, providers)
+        # so a future revert to the bare "model_path + e" form is caught.
+        from frigate.detectors.detection_runners import get_optimized_runner
+        from frigate.detectors.detector_config import ModelTypeEnum
+
+        mock_session.side_effect = [
+            RuntimeError("boom"),
+            MagicMock(get_inputs=lambda: [], get_outputs=lambda: []),
+        ]
+
+        with self.assertLogs(
+            "frigate.detectors.detection_runners", level="WARNING"
+        ) as captured:
+            get_optimized_runner(
+                "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value
+            )
+
+        joined = "\n".join(captured.output)
+        self.assertIn(f"model_type={ModelTypeEnum.yologeneric.value}", joined)
+        self.assertIn("path=/m/yolo.onnx", joined)
+        self.assertIn("device_id=0", joined)
+        self.assertIn("CUDAExecutionProvider", joined)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/frigate/util/model.py b/frigate/util/model.py
index 338303e2d..ac1cfe226 100644
--- a/frigate/util/model.py
+++ b/frigate/util/model.py
@@ -1,5 +1,6 @@
 """Model Utils"""
 
+import ctypes
 import logging
 import os
 from typing import Any
@@ -283,6 +284,56 @@ def post_process_yolox(
 ### ONNX Utilities
 
 
+def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int | None:
+    """Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena.
+
+    For CudaGraphRunner (YOLO detection) do NOT call this - CUDA graph capture
+    requires all intermediate tensors to be live simultaneously, so peak GPU memory
+    is 15-20x the model file size and cannot be safely capped. This function is
+    intended for embedding ONNXModelRunner sessions only.
+
+    Returns a limit derived from:
+    - min(max(model file size x peak_multiplier, 2 GiB floor),
+      80% of total VRAM, 90% of free VRAM)
+
+    Returns None if the CUDA runtime query fails. The caller MUST then omit
+    gpu_mem_limit from provider_options so ORT falls back to its own default
+    (grow-as-needed up to device capacity).
+
+    Tradeoff: a hardcoded fallback (e.g. 4 GB) was previously returned here,
+    but that number is wrong for both ends of the spectrum:
+    - On Jetson Nano (4 GB shared), Quadro K620 (2 GB), GT 1030 (2 GB), and
+      any container where /dev/nvidia* passthrough is broken, asking for 4 GB
+      causes ORT session init to fail with cudaErrorMemoryAllocation.
+ - On a 24 GB RTX 3090 with 20 GB free, capping at 4 GB needlessly + starves the session and forces extra arena reallocations. + Returning None and letting ORT manage the arena itself is the + least-surprising behavior when we cannot actually measure VRAM. The + leak vectors this PR addresses (mem_pattern, mallopt) are independent + of the BFC arena cap, so dropping the cap on the failure path does + not reintroduce the leak. + """ + try: + libcudart = ctypes.CDLL("libcudart.so") + free_bytes = ctypes.c_size_t() + total_bytes = ctypes.c_size_t() + rc = libcudart.cudaMemGetInfo( + ctypes.byref(free_bytes), ctypes.byref(total_bytes) + ) + if rc != 0 or total_bytes.value == 0: + raise RuntimeError(f"cudaMemGetInfo rc={rc} total={total_bytes.value}") + total = total_bytes.value + free = free_bytes.value + except Exception as e: + logger.debug("cudaMemGetInfo unavailable (%s); omitting gpu_mem_limit", e) + return None + + peak_multiplier = 14 if cuda_graph else 7 + desired = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3) + # Honor free VRAM so co-resident embedding sessions (jina text + vision, + # paddleocr det + rec, arcface) don't OOM each other on shared GPUs. + return min(desired, int(total * 0.80), int(free * 0.90)) + + def get_ort_providers( force_cpu: bool = False, device: str | None = "AUTO",