felalex 2026-05-05 23:36:47 +02:00 committed by GitHub
commit 6e084e4aca
5 changed files with 772 additions and 19 deletions

frigate/detectors/detection_runners.py

@@ -10,7 +10,7 @@ from typing import Any
 import numpy as np
 import onnxruntime as ort
-from frigate.util.model import get_ort_providers
+from frigate.util.model import compute_cuda_mem_limit, get_ort_providers
 from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible

 logger = logging.getLogger(__name__)
@@ -24,24 +24,46 @@ def is_arm64_platform() -> bool:
 def get_ort_session_options(
     is_complex_model: bool = False,
-) -> ort.SessionOptions | None:
+    variable_length_inputs: bool = False,
+) -> ort.SessionOptions:
     """Get ONNX Runtime session options with appropriate settings.

     Args:
         is_complex_model: Whether the model needs basic optimization to avoid graph fusion issues.
+        variable_length_inputs: Whether the model receives variable-length inputs (e.g. text
+            embeddings). When True, disables memory-pattern caching, which otherwise builds
+            a plan per unique input shape and holds onto mmap regions indefinitely - a major
+            source of RSS growth in the embeddings_manager process.

     Returns:
-        SessionOptions with appropriate optimization level, or None for default settings.
+        SessionOptions with appropriate settings.
     """
-    if is_complex_model:
-        sess_options = ort.SessionOptions()
+    sess_options = ort.SessionOptions()
+
+    # Disable the CPU BFC arena for all sessions. With the arena enabled ORT pools
+    # host-side staging buffers for GPU -> CPU transfers and never releases them back to
+    # the OS, causing RSS to grow without bound in long-running embedding processes.
+    sess_options.enable_cpu_mem_arena = False
+
+    if variable_length_inputs:
+        # Disable per-shape memory-layout plan caching for models with variable-length
+        # inputs (Jina CLIP text, PaddleOCR). Each unique sequence length creates a
+        # new mmap-backed plan that is never freed, leading to unbounded anon-mmap growth.
+        sess_options.enable_mem_pattern = False
+    else:
+        # Fixed-size models (like YOLO) keep mem_pattern on for buffer aliasing.
+        # Set explicitly to be robust against ORT default changes.
+        sess_options.enable_mem_pattern = True
+
+    if is_complex_model:
         sess_options.graph_optimization_level = (
             ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
         )
-        return sess_options
-    return None
+    else:
+        # Most models tolerate aggressive fusions; set explicitly to be robust
+        # against ORT default changes.
+        sess_options.graph_optimization_level = (
+            ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+        )
+
+    return sess_options

 # Import OpenVINO only when needed to avoid circular dependencies
 try:
@@ -136,6 +158,25 @@ class ONNXModelRunner(BaseModelRunner):
             ModelTypeEnum.dfine.value,
         ]

+    @staticmethod
+    def has_variable_length_inputs(model_type: str | None) -> bool:
+        """Return True for models whose input length varies between inferences.
+
+        ORT builds a memory-layout plan per unique input shape and caches it
+        indefinitely (enable_mem_pattern). For fixed-size models (YOLO) this
+        is a single plan; for variable-length text embeddings it grows without
+        bound and must be disabled.
+        """
+        if not model_type:
+            return False
+
+        from frigate.embeddings.types import EnrichmentModelTypeEnum
+
+        return model_type in [
+            EnrichmentModelTypeEnum.jina_v1.value,
+            EnrichmentModelTypeEnum.jina_v2.value,
+            EnrichmentModelTypeEnum.paddleocr.value,
+        ]
+
     @staticmethod
     def is_concurrent_model(model_type: str | None) -> bool:
         """Check if model requires thread locking for concurrent inference.
@@ -581,17 +622,26 @@ def get_optimized_runner(
         CudaGraphRunner.is_model_supported(model_type)
         and providers[0] == "CUDAExecutionProvider"
     ):
-        options[0] = {
-            **options[0],
-            "enable_cuda_graph": True,
-        }
-        return CudaGraphRunner(
-            ort.InferenceSession(
-                model_path,
-                sess_options=get_ort_session_options(),
-                providers=providers,
-                provider_options=options,
-            ),
-            options[0]["device_id"],
-        )
+        try:
+            cuda_graph_options = {**options[0], "enable_cuda_graph": True}
+            return CudaGraphRunner(
+                ort.InferenceSession(
+                    model_path,
+                    providers=providers,
+                    provider_options=[cuda_graph_options, *options[1:]],
+                ),
+                cuda_graph_options["device_id"],
+            )
+        except Exception as e:
+            logger.warning(
+                "CUDA graph capture failed for model_type=%s path=%s "
+                "device_id=%s providers=%s; falling back to standard ONNX runner: %s",
+                model_type,
+                model_path,
+                cuda_graph_options.get("device_id"),
+                providers,
+                e,
+            )

     if (
@@ -603,11 +653,19 @@ def get_optimized_runner(
         providers.pop(0)
         options.pop(0)

+    if providers and providers[0] == "CUDAExecutionProvider":
+        gpu_mem_limit = compute_cuda_mem_limit(model_path, cuda_graph=False)
+        if gpu_mem_limit is not None:
+            options[0] = {**options[0], "gpu_mem_limit": gpu_mem_limit}
+
     return ONNXModelRunner(
         ort.InferenceSession(
             model_path,
             sess_options=get_ort_session_options(
-                ONNXModelRunner.is_cpu_complex_model(model_type)
+                is_complex_model=ONNXModelRunner.is_cpu_complex_model(model_type),
+                variable_length_inputs=ONNXModelRunner.has_variable_length_inputs(
+                    model_type
+                ),
             ),
             providers=providers,
             provider_options=options,
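
Putting the detection_runners.py changes together, the intended call pattern for an embedding session is small enough to sketch. Everything below is illustrative only: the model path is hypothetical, and the sketch assumes get_ort_providers, get_ort_session_options, has_variable_length_inputs, is_cpu_complex_model, and compute_cuda_mem_limit behave exactly as defined in this diff.

import onnxruntime as ort

from frigate.detectors.detection_runners import ONNXModelRunner, get_ort_session_options
from frigate.util.model import compute_cuda_mem_limit, get_ort_providers

model_path = "/config/model_cache/jina_text.onnx"  # hypothetical path
model_type = "jina_v2"

providers, options = get_ort_providers(device="GPU")

# Cap the CUDA EP arena only when free/total VRAM could actually be measured;
# on failure the key is omitted entirely and ORT keeps its grow-as-needed default.
if providers and providers[0] == "CUDAExecutionProvider":
    limit = compute_cuda_mem_limit(model_path, cuda_graph=False)
    if limit is not None:
        options[0] = {**options[0], "gpu_mem_limit": limit}

# Jina text embeddings take variable-length token sequences, so the per-shape
# memory-pattern cache is disabled; the CPU BFC arena is off for every session.
session = ort.InferenceSession(
    model_path,
    sess_options=get_ort_session_options(
        is_complex_model=ONNXModelRunner.is_cpu_complex_model(model_type),
        variable_length_inputs=ONNXModelRunner.has_variable_length_inputs(model_type),
    ),
    providers=providers,
    provider_options=options,
)

In the real code this session is handed to ONNXModelRunner (or to CudaGraphRunner for supported detection models); the flag plumbing is the part this commit changes.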

frigate/embeddings/__init__.py

@@ -1,6 +1,7 @@
 """SQLite-vec embeddings database."""

 import base64
+import ctypes
 import json
 import logging
 import os
@@ -46,6 +47,19 @@ class EmbeddingProcess(FrigateProcess):
         self.metrics = metrics

     def run(self) -> None:
+        # glibc reads MALLOC_ARENA_MAX only once, at malloc init - before this
+        # Python interpreter is even up. Setting it via docker-compose is
+        # brittle: it has to survive the s6-overlay service-supervision chain
+        # (which can filter env via s6-setuidgid/s6-envuidgid) and arrive
+        # before the very first malloc call. Calling mallopt(M_ARENA_MAX, n_cpu)
+        # here is the runtime equivalent and works regardless of how we were
+        # spawned, capping arenas at N_CPU instead of the default 8×N_CPU and
+        # preventing heap fragmentation under the embeddings workload.
+        try:
+            ctypes.CDLL("libc.so.6").mallopt(-8, os.cpu_count())  # M_ARENA_MAX
+        except Exception:
+            pass
+
         self.pre_run_setup(self.config.logger)

         maintainer = EmbeddingMaintainer(
             self.config,
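
The mallopt call above is the piece most likely to be reused in other long-running workers, so here is the same technique in isolation. This is a minimal sketch rather than code from the commit: cap_glibc_arenas and the spelled-out M_ARENA_MAX constant are names introduced here for illustration, and the call is glibc-only (musl and macOS land in the except branch).

import ctypes
import os

M_ARENA_MAX = -8  # mallopt() parameter constant from glibc's malloc.h


def cap_glibc_arenas(max_arenas: int | None = None) -> bool:
    """Best-effort cap on glibc malloc arenas for the current process.

    Returns True if mallopt() reported success, False on non-glibc systems
    or if the call fails for any other reason.
    """
    try:
        libc = ctypes.CDLL("libc.so.6")
        n = max_arenas or os.cpu_count() or 1
        # mallopt() returns 1 on success and 0 on failure.
        return libc.mallopt(M_ARENA_MAX, n) == 1
    except (OSError, AttributeError):
        return False

Calling something like this at the top of a worker's run() bounds the arena count no matter how the process was spawned, which is exactly the property the comment above relies on.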

frigate/embeddings/types.py

@@ -7,6 +7,17 @@ class EmbeddingTypeEnum(str, Enum):

 class EnrichmentModelTypeEnum(str, Enum):
+    # When adding a value, audit every classifier that switches on it:
+    #   - ONNXModelRunner.has_variable_length_inputs
+    #   - ONNXModelRunner.is_cpu_complex_model
+    #   - ONNXModelRunner.is_migraphx_complex_model
+    #   - ONNXModelRunner.is_concurrent_model
+    #   - CudaGraphRunner.is_model_supported
+    # The default for omission is "fixed-size, simple, single-threaded" - which
+    # silently re-introduces the ORT mem-pattern leak if the new model is
+    # actually variable-length (Jina/PaddleOCR-class).
+    # TODO: replace these scattered include-lists with a single MODEL_TRAITS
+    # registry co-located with the enum so adding a value forces classification.
     arcface = "arcface"
     facenet = "facenet"
     jina_v1 = "jina_v1"

New file: tests for detection_runners session options and memory management helpers

@@ -0,0 +1,619 @@
"""Tests for detection_runners session options and memory management helpers."""
import unittest
from unittest.mock import MagicMock, patch
class TestGetOrtSessionOptions(unittest.TestCase):
def setUp(self):
import onnxruntime as ort
self.ort = ort
def test_default_disables_cpu_mem_arena(self):
from frigate.detectors.detection_runners import get_ort_session_options
opts = get_ort_session_options()
self.assertFalse(opts.enable_cpu_mem_arena)
def test_default_keeps_mem_pattern_enabled(self):
from frigate.detectors.detection_runners import get_ort_session_options
opts = get_ort_session_options()
self.assertTrue(opts.enable_mem_pattern)
def test_variable_length_inputs_disables_mem_pattern(self):
from frigate.detectors.detection_runners import get_ort_session_options
opts = get_ort_session_options(variable_length_inputs=True)
self.assertFalse(opts.enable_mem_pattern)
self.assertFalse(opts.enable_cpu_mem_arena)
def test_complex_model_sets_basic_optimization(self):
from frigate.detectors.detection_runners import get_ort_session_options
import onnxruntime as ort
opts = get_ort_session_options(is_complex_model=True)
self.assertEqual(
opts.graph_optimization_level,
ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
)
def test_default_sets_enable_all_optimization(self):
# Guards the explicit `else` branch added so the optimization level is
# never implicit — protects against ORT default changes.
from frigate.detectors.detection_runners import get_ort_session_options
import onnxruntime as ort
opts = get_ort_session_options()
self.assertEqual(
opts.graph_optimization_level,
ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
)
def test_always_returns_session_options(self):
from frigate.detectors.detection_runners import get_ort_session_options
import onnxruntime as ort
self.assertIsInstance(get_ort_session_options(), ort.SessionOptions)
self.assertIsInstance(
get_ort_session_options(is_complex_model=True), ort.SessionOptions
)
self.assertIsInstance(
get_ort_session_options(variable_length_inputs=True), ort.SessionOptions
)
class TestHasVariableLengthInputs(unittest.TestCase):
def test_jina_v1_is_variable(self):
from frigate.detectors.detection_runners import ONNXModelRunner
from frigate.embeddings.types import EnrichmentModelTypeEnum
self.assertTrue(
ONNXModelRunner.has_variable_length_inputs(
EnrichmentModelTypeEnum.jina_v1.value
)
)
def test_jina_v2_is_variable(self):
from frigate.detectors.detection_runners import ONNXModelRunner
from frigate.embeddings.types import EnrichmentModelTypeEnum
self.assertTrue(
ONNXModelRunner.has_variable_length_inputs(
EnrichmentModelTypeEnum.jina_v2.value
)
)
def test_paddleocr_is_variable(self):
from frigate.detectors.detection_runners import ONNXModelRunner
from frigate.embeddings.types import EnrichmentModelTypeEnum
self.assertTrue(
ONNXModelRunner.has_variable_length_inputs(
EnrichmentModelTypeEnum.paddleocr.value
)
)
def test_yolo_generic_is_fixed(self):
from frigate.detectors.detection_runners import ONNXModelRunner
from frigate.detectors.detector_config import ModelTypeEnum
self.assertFalse(
ONNXModelRunner.has_variable_length_inputs(ModelTypeEnum.yologeneric.value)
)
def test_none_is_fixed(self):
from frigate.detectors.detection_runners import ONNXModelRunner
self.assertFalse(ONNXModelRunner.has_variable_length_inputs(None))
def test_arcface_is_fixed(self):
from frigate.detectors.detection_runners import ONNXModelRunner
from frigate.embeddings.types import EnrichmentModelTypeEnum
self.assertFalse(
ONNXModelRunner.has_variable_length_inputs(
EnrichmentModelTypeEnum.arcface.value
)
)
def test_facenet_is_fixed(self):
from frigate.detectors.detection_runners import ONNXModelRunner
from frigate.embeddings.types import EnrichmentModelTypeEnum
self.assertFalse(
ONNXModelRunner.has_variable_length_inputs(
EnrichmentModelTypeEnum.facenet.value
)
)
def test_yolov9_license_plate_is_fixed(self):
from frigate.detectors.detection_runners import ONNXModelRunner
from frigate.embeddings.types import EnrichmentModelTypeEnum
self.assertFalse(
ONNXModelRunner.has_variable_length_inputs(
EnrichmentModelTypeEnum.yolov9_license_plate.value
)
)
def test_every_enrichment_model_is_explicitly_classified(self):
"""Every EnrichmentModelTypeEnum value must be deliberately classified.
Adding a new model to the enum without updating has_variable_length_inputs
silently defaults it to fixed-size (mem_pattern stays on), which
re-introduces the ORT mmap-plan leak if the new model is actually
variable-length. This test fails on any unclassified enum value so the
author is forced to make a deliberate decision.
TODO: replace this guard with a single MODEL_TRAITS registry co-located
with EnrichmentModelTypeEnum so adding a value mechanically forces
classification across every classifier (variable-length, cpu_complex,
migraphx_complex, concurrent, cuda_graph_supported), not just this one.
"""
from frigate.detectors.detection_runners import ONNXModelRunner
from frigate.embeddings.types import EnrichmentModelTypeEnum
VARIABLE_LENGTH = {
EnrichmentModelTypeEnum.jina_v1,
EnrichmentModelTypeEnum.jina_v2,
EnrichmentModelTypeEnum.paddleocr,
}
FIXED_LENGTH = {
EnrichmentModelTypeEnum.arcface,
EnrichmentModelTypeEnum.facenet,
EnrichmentModelTypeEnum.yolov9_license_plate,
}
classified = VARIABLE_LENGTH | FIXED_LENGTH
for member in EnrichmentModelTypeEnum:
self.assertIn(
member,
classified,
f"{member.value} is not explicitly classified — audit "
"ONNXModelRunner.has_variable_length_inputs (and the other "
"classifiers listed in EnrichmentModelTypeEnum's docstring).",
)
self.assertEqual(
ONNXModelRunner.has_variable_length_inputs(member.value),
member in VARIABLE_LENGTH,
f"{member.value}: classification disagrees with "
"has_variable_length_inputs — update one or the other.",
)
class TestComputeCudaMemLimit(unittest.TestCase):
@staticmethod
def _fake_mem_get_info(free_value: int, total_value: int):
def _impl(free_ptr, total_ptr):
free_ptr._obj.value = free_value
total_ptr._obj.value = total_value
return 0 # cudaSuccess
return _impl
@patch("frigate.util.model.ctypes.CDLL")
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
def test_respects_ceiling(self, _mock_getsize, mock_cdll):
from frigate.util.model import compute_cuda_mem_limit
total_vram = 24 * 1024**3
mock_lib = MagicMock()
mock_cdll.return_value = mock_lib
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
total_vram, total_vram
)
limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
self.assertLessEqual(limit, int(total_vram * 0.80))
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
def test_returns_none_when_cuda_unavailable(self, _mock_cdll):
# See compute_cuda_mem_limit docstring for the tradeoff: returning a
# hardcoded fallback was wrong for low-VRAM devices (Jetson Nano, K620).
from frigate.util.model import compute_cuda_mem_limit
self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx"))
@patch("frigate.util.model.ctypes.CDLL")
@patch("os.path.getsize", return_value=50 * 1024 * 1024)
def test_floor_is_at_least_2gb(self, _mock_getsize, mock_cdll):
from frigate.util.model import compute_cuda_mem_limit
total_vram = 24 * 1024**3
mock_lib = MagicMock()
mock_cdll.return_value = mock_lib
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
total_vram, total_vram
)
limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
self.assertGreaterEqual(limit, 2 * 1024**3)
@patch("frigate.util.model.ctypes.CDLL")
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
def test_returns_none_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll):
# Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0,
# producing gpu_mem_limit=0 and immediate session OOM. We now return
# None so the caller omits gpu_mem_limit and ORT manages the arena.
from frigate.util.model import compute_cuda_mem_limit
mock_lib = MagicMock()
mock_cdll.return_value = mock_lib
mock_lib.cudaMemGetInfo.return_value = 2 # cudaErrorMemoryAllocation
self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False))
@patch("frigate.util.model.ctypes.CDLL")
@patch("os.path.getsize", return_value=500 * 1024 * 1024)
def test_cuda_graph_doubles_peak_multiplier(self, _mock_getsize, mock_cdll):
# cuda_graph=True must use peak_multiplier=14 (vs 7 for cuda_graph=False)
# because graph capture pins all intermediate tensors live simultaneously.
from frigate.util.model import compute_cuda_mem_limit
total_vram = 24 * 1024**3
mock_lib = MagicMock()
mock_cdll.return_value = mock_lib
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
total_vram, total_vram
)
model_size = 500 * 1024 * 1024
with_graph = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=True)
without_graph = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
self.assertGreaterEqual(with_graph, model_size * 14)
self.assertGreaterEqual(without_graph, model_size * 7)
self.assertGreater(with_graph, without_graph)
@patch("frigate.util.model.ctypes.CDLL")
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
def test_capped_by_free_vram_when_constrained(self, _mock_getsize, mock_cdll):
# Bug #2: with 3 GB free of 24 GB, the limit must respect free × 0.9,
# not 80% of total — co-resident embedding sessions would OOM otherwise.
from frigate.util.model import compute_cuda_mem_limit
mock_lib = MagicMock()
mock_cdll.return_value = mock_lib
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
3 * 1024**3, 24 * 1024**3
)
limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
self.assertLessEqual(limit, int(3 * 1024**3 * 0.90))
class TestOrtLeakFixRegression(unittest.TestCase):
"""Regression guards for the embeddings_manager ORT memory leak fix.
These tests verify that the three leak vectors identified in GitHub Discussion
#23007 remain fixed:
1. ORT CPU BFC arena (enable_cpu_mem_arena) must be False for all sessions
so host-side GPU -> CPU staging buffers are not pooled indefinitely.
2. ORT memory-pattern cache (enable_mem_pattern) must be False for
variable-length embedding models (Jina, PaddleOCR) to prevent one
mmap-backed plan per unique sequence length from accumulating forever.
Must remain True for fixed-size models (YOLO) to preserve buffer aliasing.
3. mallopt(M_ARENA_MAX) must be called from inside EmbeddingProcess.run()
because glibc reads MALLOC_ARENA_MAX once at malloc init, and the env
var is brittle to deliver through s6-overlay supervision before that
point. In-process mallopt is the runtime-safe equivalent.
"""
def test_get_optimized_runner_passes_variable_length_for_jina(self):
"""get_optimized_runner must enable variable_length_inputs for Jina models."""
from frigate.detectors.detection_runners import get_ort_session_options
from frigate.embeddings.types import EnrichmentModelTypeEnum
with patch(
"frigate.detectors.detection_runners.get_ort_session_options",
wraps=get_ort_session_options,
) as mock_opts, patch(
"frigate.detectors.detection_runners.ort.InferenceSession"
), patch(
"frigate.detectors.detection_runners.get_ort_providers",
return_value=(["CPUExecutionProvider"], [{}]),
), patch(
"frigate.detectors.detection_runners.is_rknn_compatible",
return_value=False,
), patch(
"os.path.getsize", return_value=100 * 1024 * 1024
):
from frigate.detectors.detection_runners import get_optimized_runner
get_optimized_runner(
"/fake/jina.onnx",
device="CPU",
model_type=EnrichmentModelTypeEnum.jina_v2.value,
)
calls = mock_opts.call_args_list
self.assertTrue(
any(c.kwargs.get("variable_length_inputs") for c in calls),
"get_ort_session_options must be called with variable_length_inputs=True "
"for Jina models to prevent mmap plan cache growth",
)
def test_get_optimized_runner_does_not_set_variable_length_for_yolo(self):
"""get_optimized_runner must NOT set variable_length_inputs for YOLO.
Disabling enable_mem_pattern on YOLO (fixed 640×640 input) prevents ORT
from aliasing buffers between nodes, pushing peak GPU memory from ~1.8 GB
to >4 GB and crashing CUDA graph capture.
"""
from frigate.detectors.detection_runners import get_ort_session_options
from frigate.detectors.detector_config import ModelTypeEnum
with patch(
"frigate.detectors.detection_runners.get_ort_session_options",
wraps=get_ort_session_options,
) as mock_opts, patch(
"frigate.detectors.detection_runners.ort.InferenceSession"
) as mock_session, patch(
"frigate.detectors.detection_runners.get_ort_providers",
return_value=(["CPUExecutionProvider"], [{}]),
), patch(
"frigate.detectors.detection_runners.is_rknn_compatible",
return_value=False,
), patch(
"os.path.getsize", return_value=220 * 1024 * 1024
):
mock_session.return_value.get_inputs.return_value = []
mock_session.return_value.get_outputs.return_value = []
from frigate.detectors.detection_runners import get_optimized_runner
get_optimized_runner(
"/fake/yolov9.onnx",
device="CPU",
model_type=ModelTypeEnum.yologeneric.value,
)
for call in mock_opts.call_args_list:
self.assertFalse(
call.kwargs.get("variable_length_inputs", False),
"variable_length_inputs must not be True for YOLO — disabling "
"enable_mem_pattern on fixed-size models causes CUDA graph crashes",
)
def test_all_sessions_disable_cpu_mem_arena(self):
"""enable_cpu_mem_arena must be False regardless of model type.
With the arena enabled, ORT pools CPU-side staging buffers for GPU -> CPU
transfers indefinitely, causing RSS growth of hundreds of MB per hour.
"""
from frigate.detectors.detection_runners import get_ort_session_options
from frigate.embeddings.types import EnrichmentModelTypeEnum
for model_type in [
None,
EnrichmentModelTypeEnum.jina_v1.value,
EnrichmentModelTypeEnum.jina_v2.value,
EnrichmentModelTypeEnum.paddleocr.value,
]:
with self.subTest(model_type=model_type):
from frigate.detectors.detection_runners import ONNXModelRunner
opts = get_ort_session_options(
variable_length_inputs=ONNXModelRunner.has_variable_length_inputs(
model_type
)
)
self.assertFalse(
opts.enable_cpu_mem_arena,
f"enable_cpu_mem_arena must be False for model_type={model_type}",
)
def test_embedding_process_calls_mallopt(self):
"""EmbeddingProcess.run() must call mallopt(M_ARENA_MAX) to cap glibc arenas.
glibc reads MALLOC_ARENA_MAX only at malloc init, before this Python
interpreter is up, and the env var is brittle to deliver through the
s6-overlay service-supervision chain before that point. mallopt()
is the runtime-safe equivalent and must be called explicitly from run().
"""
import frigate.embeddings as emb_module
# Make EmbeddingMaintainer raise immediately so run() exits after mallopt.
with patch.object(
emb_module, "EmbeddingMaintainer", side_effect=RuntimeError("stop")
), patch.object(
emb_module.EmbeddingProcess, "pre_run_setup"
), patch(
"ctypes.CDLL"
) as mock_cdll:
mock_libc = MagicMock()
mock_cdll.return_value = mock_libc
process = emb_module.EmbeddingProcess.__new__(
emb_module.EmbeddingProcess
)
process.config = MagicMock()
process.metrics = MagicMock()
process.stop_event = MagicMock(is_set=MagicMock(return_value=True))
try:
process.run()
except RuntimeError:
pass
mock_cdll.assert_called_with("libc.so.6")
mock_libc.mallopt.assert_called_once()
args = mock_libc.mallopt.call_args[0]
self.assertEqual(
args[0],
-8, # M_ARENA_MAX
"mallopt must be called with M_ARENA_MAX (-8)",
)
class TestRunnerOmitsGpuMemLimitOnCudaQueryFailure(unittest.TestCase):
"""When compute_cuda_mem_limit returns None, get_optimized_runner must NOT
inject gpu_mem_limit at all, leaving ORT's grow-as-needed default in place."""
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
@patch(
"frigate.detectors.detection_runners.get_ort_providers",
return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
)
@patch(
"frigate.detectors.detection_runners.is_rknn_compatible",
return_value=False,
)
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
def test_no_gpu_mem_limit_key_when_cuda_query_fails(
self, _gs, _cdll, _rknn, _gp, mock_session
):
from frigate.detectors.detection_runners import get_optimized_runner
from frigate.embeddings.types import EnrichmentModelTypeEnum
mock_session.return_value.get_inputs.return_value = []
mock_session.return_value.get_outputs.return_value = []
get_optimized_runner(
"/fake/jina.onnx",
device="GPU",
model_type=EnrichmentModelTypeEnum.jina_v2.value,
)
provider_opts = mock_session.call_args.kwargs["provider_options"]
self.assertNotIn(
"gpu_mem_limit",
provider_opts[0],
"Must omit (not set to 0, not set to a guess) when query fails",
)
class TestRunnerInjectsGpuMemLimitOnCudaQuerySuccess(unittest.TestCase):
"""Positive counterpart to TestRunnerOmitsGpuMemLimitOnCudaQueryFailure:
when cudaMemGetInfo succeeds, gpu_mem_limit must be injected into
provider_options so ORT's BFC arena is bounded."""
@staticmethod
def _fake_mem_get_info(free_value: int, total_value: int):
def _impl(free_ptr, total_ptr):
free_ptr._obj.value = free_value
total_ptr._obj.value = total_value
return 0 # cudaSuccess
return _impl
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
@patch(
"frigate.detectors.detection_runners.get_ort_providers",
return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
)
@patch(
"frigate.detectors.detection_runners.is_rknn_compatible",
return_value=False,
)
@patch("frigate.util.model.ctypes.CDLL")
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
def test_gpu_mem_limit_key_present_when_cuda_query_succeeds(
self, _gs, mock_cdll, _rknn, _gp, mock_session
):
from frigate.detectors.detection_runners import get_optimized_runner
from frigate.embeddings.types import EnrichmentModelTypeEnum
total_vram = 24 * 1024**3
mock_lib = MagicMock()
mock_cdll.return_value = mock_lib
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
total_vram, total_vram
)
mock_session.return_value.get_inputs.return_value = []
mock_session.return_value.get_outputs.return_value = []
get_optimized_runner(
"/fake/jina.onnx",
device="GPU",
model_type=EnrichmentModelTypeEnum.jina_v2.value,
)
provider_opts = mock_session.call_args.kwargs["provider_options"]
self.assertIn("gpu_mem_limit", provider_opts[0])
self.assertGreater(provider_opts[0]["gpu_mem_limit"], 0)
class TestCudaGraphFallbackLogsException(unittest.TestCase):
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
@patch(
"frigate.detectors.detection_runners.get_ort_providers",
return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
)
@patch(
"frigate.detectors.detection_runners.is_rknn_compatible",
return_value=False,
)
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
def test_fallback_warning_includes_exception_text(
self, _gs, _cdll, _rknn, _gp, mock_session
):
# Concern #1: the bare `except Exception:` swallowed the underlying
# ORT error (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.),
# turning a debuggable failure into an opaque "fell back to ONNX runner".
from frigate.detectors.detection_runners import get_optimized_runner
from frigate.detectors.detector_config import ModelTypeEnum
mock_session.side_effect = [
RuntimeError("cudaErrorStreamCaptureUnsupported"),
MagicMock(get_inputs=lambda: [], get_outputs=lambda: []),
]
with self.assertLogs(
"frigate.detectors.detection_runners", level="WARNING"
) as captured:
get_optimized_runner(
"/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value
)
joined = "\n".join(captured.output)
self.assertIn("CUDA graph capture failed", joined)
self.assertIn("cudaErrorStreamCaptureUnsupported", joined)
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
@patch(
"frigate.detectors.detection_runners.get_ort_providers",
return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
)
@patch(
"frigate.detectors.detection_runners.is_rknn_compatible",
return_value=False,
)
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
def test_fallback_warning_includes_developer_context(
self, _gs, _cdll, _rknn, _gp, mock_session
):
# Guards the enriched warning fields (model_type, device_id, providers)
# so a future revert to the bare "model_path + e" form is caught.
from frigate.detectors.detection_runners import get_optimized_runner
from frigate.detectors.detector_config import ModelTypeEnum
mock_session.side_effect = [
RuntimeError("boom"),
MagicMock(get_inputs=lambda: [], get_outputs=lambda: []),
]
with self.assertLogs(
"frigate.detectors.detection_runners", level="WARNING"
) as captured:
get_optimized_runner(
"/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value
)
joined = "\n".join(captured.output)
self.assertIn(f"model_type={ModelTypeEnum.yologeneric.value}", joined)
self.assertIn("path=/m/yolo.onnx", joined)
self.assertIn("device_id=0", joined)
self.assertIn("CUDAExecutionProvider", joined)
if __name__ == "__main__":
unittest.main()

frigate/util/model.py

@@ -1,5 +1,6 @@
 """Model Utils"""

+import ctypes
 import logging
 import os
 from typing import Any
@@ -283,6 +284,56 @@ def post_process_yolox(
 ### ONNX Utilities

+def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int | None:
+    """Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena.
+
+    For CudaGraphRunner (YOLO detection) do NOT call this - CUDA graph capture
+    requires all intermediate tensors to be live simultaneously, so peak GPU memory
+    is 15-20x the model file size and cannot be safely capped. This function is
+    intended for embedding ONNXModelRunner sessions only.
+
+    Returns a limit derived from:
+      - min(model file size x peak_multiplier, 80% of total VRAM, 90% of free VRAM)
+
+    Returns None if the CUDA runtime query fails. The caller MUST then omit
+    gpu_mem_limit from provider_options so ORT falls back to its own default
+    (grow-as-needed up to device capacity).
+
+    Tradeoff: a hardcoded fallback (e.g. 4 GB) was previously returned here,
+    but that number is wrong for both ends of the spectrum:
+      - On Jetson Nano (4 GB shared), Quadro K620 (2 GB), GT 1030 (2 GB), and
+        any container where /dev/nvidia* passthrough is broken, asking for 4 GB
+        causes ORT session init to fail with cudaErrorMemoryAllocation.
+      - On a 24 GB RTX 3090 with 20 GB free, capping at 4 GB needlessly
+        starves the session and forces extra arena reallocations.
+    Returning None and letting ORT manage the arena itself is the
+    least-surprising behavior when we cannot actually measure VRAM. The
+    leak vectors this PR addresses (mem_pattern, mallopt) are independent
+    of the BFC arena cap, so dropping the cap on the failure path does
+    not reintroduce the leak.
+    """
+    try:
+        libcudart = ctypes.CDLL("libcudart.so")
+        free_bytes = ctypes.c_size_t()
+        total_bytes = ctypes.c_size_t()
+        rc = libcudart.cudaMemGetInfo(
+            ctypes.byref(free_bytes), ctypes.byref(total_bytes)
+        )
+        if rc != 0 or total_bytes.value == 0:
+            raise RuntimeError(f"cudaMemGetInfo rc={rc} total={total_bytes.value}")
+        total = total_bytes.value
+        free = free_bytes.value
+    except Exception as e:
+        logger.debug("cudaMemGetInfo unavailable (%s); omitting gpu_mem_limit", e)
+        return None
+
+    peak_multiplier = 14 if cuda_graph else 7
+    desired = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
+
+    # Honor free VRAM so co-resident embedding sessions (jina text + vision,
+    # paddleocr det + rec, arcface) don't OOM each other on shared GPUs.
+    return min(desired, int(total * 0.80), int(free * 0.90))
+
 def get_ort_providers(
     force_cpu: bool = False,
     device: str | None = "AUTO",
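
For intuition about the numbers compute_cuda_mem_limit produces, consider a 200 MiB model with cuda_graph=False on a 24 GiB card that currently has 3 GiB free (the constrained case exercised by test_capped_by_free_vram_when_constrained above). The snippet below is a throwaway restatement of the same formula, not code from the commit.

GiB = 1024**3
model_size = 200 * 1024**2                    # 200 MiB ONNX file on disk
total_vram, free_vram = 24 * GiB, 3 * GiB     # busy 24 GiB card

peak_multiplier = 7                           # 14 when cuda_graph=True
desired = max(model_size * peak_multiplier, 2 * GiB)  # 1.4 GiB, floored to 2 GiB
limit = min(desired, int(total_vram * 0.80), int(free_vram * 0.90))
print(limit / GiB)                            # 2.0, i.e. gpu_mem_limit = 2 GiB

The 90%-of-free term is what protects co-resident sessions on a busy card, the 2 GiB floor keeps small models from being starved (and is the binding term here), and cuda_graph=True only changes peak_multiplier from 7 to 14.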