mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-05-07 05:55:27 +03:00
Merge 740f2e9b68 into 76a1230885
This commit is contained in:
commit
6e084e4aca
@ -10,7 +10,7 @@ from typing import Any
|
||||
import numpy as np
|
||||
import onnxruntime as ort
|
||||
|
||||
from frigate.util.model import get_ort_providers
|
||||
from frigate.util.model import compute_cuda_mem_limit, get_ort_providers
|
||||
from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -24,24 +24,46 @@ def is_arm64_platform() -> bool:
|
||||
|
||||
def get_ort_session_options(
|
||||
is_complex_model: bool = False,
|
||||
) -> ort.SessionOptions | None:
|
||||
variable_length_inputs: bool = False,
|
||||
) -> ort.SessionOptions:
|
||||
"""Get ONNX Runtime session options with appropriate settings.
|
||||
|
||||
Args:
|
||||
is_complex_model: Whether the model needs basic optimization to avoid graph fusion issues.
|
||||
variable_length_inputs: Whether the model receives variable-length inputs (e.g. text
|
||||
embeddings). When True, disables memory-pattern caching, which otherwise builds
|
||||
a plan per unique input shape and holds onto mmap regions indefinitely - a major
|
||||
source of RSS growth in the embeddings_manager process.
|
||||
|
||||
Returns:
|
||||
SessionOptions with appropriate optimization level, or None for default settings.
|
||||
SessionOptions with appropriate settings.
|
||||
"""
|
||||
if is_complex_model:
|
||||
sess_options = ort.SessionOptions()
|
||||
# Disable the CPU BFC arena for all sessions. With the arena enabled ORT pools
|
||||
# host-side staging buffers for GPU -> CPU transfers and never releases them back to
|
||||
# the OS, causing RSS to grow without bound in long-running embedding processes.
|
||||
sess_options.enable_cpu_mem_arena = False
|
||||
if variable_length_inputs:
|
||||
# Disable per-shape memory-layout plan caching for models with variable-length
|
||||
# inputs (Jina CLIP text, PaddleOCR). Each unique sequence length creates a
|
||||
# new mmap-backed plan that is never freed, leading to unbounded anon-mmap growth.
|
||||
sess_options.enable_mem_pattern = False
|
||||
else:
|
||||
# Fixed-size models (like YOLO ) keep mem_pattern on for buffer aliasing.
|
||||
# Set explicitly to be robust against ORT default changes.
|
||||
sess_options.enable_mem_pattern = True
|
||||
if is_complex_model:
|
||||
sess_options.graph_optimization_level = (
|
||||
ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
|
||||
)
|
||||
else:
|
||||
# Most models tolerate aggressive fusions; set explicitly to be robust
|
||||
# against ORT default changes.
|
||||
sess_options.graph_optimization_level = (
|
||||
ort.GraphOptimizationLevel.ORT_ENABLE_ALL
|
||||
)
|
||||
return sess_options
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# Import OpenVINO only when needed to avoid circular dependencies
|
||||
try:
|
||||
@ -136,6 +158,25 @@ class ONNXModelRunner(BaseModelRunner):
|
||||
ModelTypeEnum.dfine.value,
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def has_variable_length_inputs(model_type: str | None) -> bool:
|
||||
"""Return True for models whose input length varies between inferences.
|
||||
|
||||
ORT builds a memory-layout plan per unique input shape and caches it
|
||||
indefinitely (enable_mem_pattern). For fixed-size models (YOLO) this
|
||||
is a single plan; for variable-length text embeddings it grows without
|
||||
bound and must be disabled.
|
||||
"""
|
||||
if not model_type:
|
||||
return False
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
return model_type in [
|
||||
EnrichmentModelTypeEnum.jina_v1.value,
|
||||
EnrichmentModelTypeEnum.jina_v2.value,
|
||||
EnrichmentModelTypeEnum.paddleocr.value,
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def is_concurrent_model(model_type: str | None) -> bool:
|
||||
"""Check if model requires thread locking for concurrent inference.
|
||||
@ -581,17 +622,26 @@ def get_optimized_runner(
|
||||
CudaGraphRunner.is_model_supported(model_type)
|
||||
and providers[0] == "CUDAExecutionProvider"
|
||||
):
|
||||
options[0] = {
|
||||
**options[0],
|
||||
"enable_cuda_graph": True,
|
||||
}
|
||||
try:
|
||||
cuda_graph_options = {**options[0], "enable_cuda_graph": True}
|
||||
return CudaGraphRunner(
|
||||
ort.InferenceSession(
|
||||
model_path,
|
||||
sess_options=get_ort_session_options(),
|
||||
providers=providers,
|
||||
provider_options=options,
|
||||
provider_options=[cuda_graph_options, *options[1:]],
|
||||
),
|
||||
options[0]["device_id"],
|
||||
cuda_graph_options["device_id"],
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"CUDA graph capture failed for model_type=%s path=%s "
|
||||
"device_id=%s providers=%s; falling back to standard ONNX runner: %s",
|
||||
model_type,
|
||||
model_path,
|
||||
cuda_graph_options.get("device_id"),
|
||||
providers,
|
||||
e,
|
||||
)
|
||||
|
||||
if (
|
||||
@ -603,11 +653,19 @@ def get_optimized_runner(
|
||||
providers.pop(0)
|
||||
options.pop(0)
|
||||
|
||||
if providers and providers[0] == "CUDAExecutionProvider":
|
||||
gpu_mem_limit = compute_cuda_mem_limit(model_path, cuda_graph=False)
|
||||
if gpu_mem_limit is not None:
|
||||
options[0] = {**options[0], "gpu_mem_limit": gpu_mem_limit}
|
||||
|
||||
return ONNXModelRunner(
|
||||
ort.InferenceSession(
|
||||
model_path,
|
||||
sess_options=get_ort_session_options(
|
||||
ONNXModelRunner.is_cpu_complex_model(model_type)
|
||||
is_complex_model=ONNXModelRunner.is_cpu_complex_model(model_type),
|
||||
variable_length_inputs=ONNXModelRunner.has_variable_length_inputs(
|
||||
model_type
|
||||
),
|
||||
),
|
||||
providers=providers,
|
||||
provider_options=options,
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"""SQLite-vec embeddings database."""
|
||||
|
||||
import base64
|
||||
import ctypes
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@ -46,6 +47,19 @@ class EmbeddingProcess(FrigateProcess):
|
||||
self.metrics = metrics
|
||||
|
||||
def run(self) -> None:
|
||||
# glibc reads MALLOC_ARENA_MAX only once, at malloc init - before this
|
||||
# Python interpreter is even up. Setting it via docker-compose is
|
||||
# brittle: it has to survive the s6-overlay service-supervision chain
|
||||
# (which can filter env via s6-setuidgid/s6-envuidgid) and arrive
|
||||
# before the very first malloc call. Calling mallopt(M_ARENA_MAX, n_cpu)
|
||||
# here is the runtime equivalent and works regardless of how we were
|
||||
# spawned, capping arenas at N_CPU instead of the default 8×N_CPU and
|
||||
# preventing heap fragmentation under the embeddings workload.
|
||||
try:
|
||||
ctypes.CDLL("libc.so.6").mallopt(-8, os.cpu_count()) # M_ARENA_MAX
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
self.pre_run_setup(self.config.logger)
|
||||
maintainer = EmbeddingMaintainer(
|
||||
self.config,
|
||||
|
||||
@ -7,6 +7,17 @@ class EmbeddingTypeEnum(str, Enum):
|
||||
|
||||
|
||||
class EnrichmentModelTypeEnum(str, Enum):
|
||||
# When adding a value, audit every classifier that switches on it:
|
||||
# - ONNXModelRunner.has_variable_length_inputs
|
||||
# - ONNXModelRunner.is_cpu_complex_model
|
||||
# - ONNXModelRunner.is_migraphx_complex_model
|
||||
# - ONNXModelRunner.is_concurrent_model
|
||||
# - CudaGraphRunner.is_model_supported
|
||||
# The default for omission is "fixed-size, simple, single-threaded" - which
|
||||
# silently re-introduces the ORT mem-pattern leak if the new model is
|
||||
# actually variable-length (Jina/PaddleOCR-class).
|
||||
# TODO: replace these scattered include-lists with a single MODEL_TRAITS
|
||||
# registry co-located with the enum so adding a value forces classification.
|
||||
arcface = "arcface"
|
||||
facenet = "facenet"
|
||||
jina_v1 = "jina_v1"
|
||||
|
||||
619
frigate/test/test_detection_runners.py
Normal file
619
frigate/test/test_detection_runners.py
Normal file
@ -0,0 +1,619 @@
|
||||
"""Tests for detection_runners session options and memory management helpers."""
|
||||
|
||||
import unittest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
|
||||
class TestGetOrtSessionOptions(unittest.TestCase):
|
||||
def setUp(self):
|
||||
import onnxruntime as ort
|
||||
|
||||
self.ort = ort
|
||||
|
||||
def test_default_disables_cpu_mem_arena(self):
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
|
||||
opts = get_ort_session_options()
|
||||
self.assertFalse(opts.enable_cpu_mem_arena)
|
||||
|
||||
def test_default_keeps_mem_pattern_enabled(self):
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
|
||||
opts = get_ort_session_options()
|
||||
self.assertTrue(opts.enable_mem_pattern)
|
||||
|
||||
def test_variable_length_inputs_disables_mem_pattern(self):
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
|
||||
opts = get_ort_session_options(variable_length_inputs=True)
|
||||
self.assertFalse(opts.enable_mem_pattern)
|
||||
self.assertFalse(opts.enable_cpu_mem_arena)
|
||||
|
||||
def test_complex_model_sets_basic_optimization(self):
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
|
||||
import onnxruntime as ort
|
||||
|
||||
opts = get_ort_session_options(is_complex_model=True)
|
||||
self.assertEqual(
|
||||
opts.graph_optimization_level,
|
||||
ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
|
||||
)
|
||||
|
||||
def test_default_sets_enable_all_optimization(self):
|
||||
# Guards the explicit `else` branch added so the optimization level is
|
||||
# never implicit — protects against ORT default changes.
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
|
||||
import onnxruntime as ort
|
||||
|
||||
opts = get_ort_session_options()
|
||||
self.assertEqual(
|
||||
opts.graph_optimization_level,
|
||||
ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
|
||||
)
|
||||
|
||||
def test_always_returns_session_options(self):
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
|
||||
import onnxruntime as ort
|
||||
|
||||
self.assertIsInstance(get_ort_session_options(), ort.SessionOptions)
|
||||
self.assertIsInstance(
|
||||
get_ort_session_options(is_complex_model=True), ort.SessionOptions
|
||||
)
|
||||
self.assertIsInstance(
|
||||
get_ort_session_options(variable_length_inputs=True), ort.SessionOptions
|
||||
)
|
||||
|
||||
|
||||
class TestHasVariableLengthInputs(unittest.TestCase):
|
||||
def test_jina_v1_is_variable(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
self.assertTrue(
|
||||
ONNXModelRunner.has_variable_length_inputs(
|
||||
EnrichmentModelTypeEnum.jina_v1.value
|
||||
)
|
||||
)
|
||||
|
||||
def test_jina_v2_is_variable(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
self.assertTrue(
|
||||
ONNXModelRunner.has_variable_length_inputs(
|
||||
EnrichmentModelTypeEnum.jina_v2.value
|
||||
)
|
||||
)
|
||||
|
||||
def test_paddleocr_is_variable(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
self.assertTrue(
|
||||
ONNXModelRunner.has_variable_length_inputs(
|
||||
EnrichmentModelTypeEnum.paddleocr.value
|
||||
)
|
||||
)
|
||||
|
||||
def test_yolo_generic_is_fixed(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.detectors.detector_config import ModelTypeEnum
|
||||
|
||||
self.assertFalse(
|
||||
ONNXModelRunner.has_variable_length_inputs(ModelTypeEnum.yologeneric.value)
|
||||
)
|
||||
|
||||
def test_none_is_fixed(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
|
||||
self.assertFalse(ONNXModelRunner.has_variable_length_inputs(None))
|
||||
|
||||
def test_arcface_is_fixed(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
self.assertFalse(
|
||||
ONNXModelRunner.has_variable_length_inputs(
|
||||
EnrichmentModelTypeEnum.arcface.value
|
||||
)
|
||||
)
|
||||
|
||||
def test_facenet_is_fixed(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
self.assertFalse(
|
||||
ONNXModelRunner.has_variable_length_inputs(
|
||||
EnrichmentModelTypeEnum.facenet.value
|
||||
)
|
||||
)
|
||||
|
||||
def test_yolov9_license_plate_is_fixed(self):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
self.assertFalse(
|
||||
ONNXModelRunner.has_variable_length_inputs(
|
||||
EnrichmentModelTypeEnum.yolov9_license_plate.value
|
||||
)
|
||||
)
|
||||
|
||||
def test_every_enrichment_model_is_explicitly_classified(self):
|
||||
"""Every EnrichmentModelTypeEnum value must be deliberately classified.
|
||||
|
||||
Adding a new model to the enum without updating has_variable_length_inputs
|
||||
silently defaults it to fixed-size (mem_pattern stays on), which
|
||||
re-introduces the ORT mmap-plan leak if the new model is actually
|
||||
variable-length. This test fails on any unclassified enum value so the
|
||||
author is forced to make a deliberate decision.
|
||||
|
||||
TODO: replace this guard with a single MODEL_TRAITS registry co-located
|
||||
with EnrichmentModelTypeEnum so adding a value mechanically forces
|
||||
classification across every classifier (variable-length, cpu_complex,
|
||||
migraphx_complex, concurrent, cuda_graph_supported), not just this one.
|
||||
"""
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
VARIABLE_LENGTH = {
|
||||
EnrichmentModelTypeEnum.jina_v1,
|
||||
EnrichmentModelTypeEnum.jina_v2,
|
||||
EnrichmentModelTypeEnum.paddleocr,
|
||||
}
|
||||
FIXED_LENGTH = {
|
||||
EnrichmentModelTypeEnum.arcface,
|
||||
EnrichmentModelTypeEnum.facenet,
|
||||
EnrichmentModelTypeEnum.yolov9_license_plate,
|
||||
}
|
||||
classified = VARIABLE_LENGTH | FIXED_LENGTH
|
||||
for member in EnrichmentModelTypeEnum:
|
||||
self.assertIn(
|
||||
member,
|
||||
classified,
|
||||
f"{member.value} is not explicitly classified — audit "
|
||||
"ONNXModelRunner.has_variable_length_inputs (and the other "
|
||||
"classifiers listed in EnrichmentModelTypeEnum's docstring).",
|
||||
)
|
||||
self.assertEqual(
|
||||
ONNXModelRunner.has_variable_length_inputs(member.value),
|
||||
member in VARIABLE_LENGTH,
|
||||
f"{member.value}: classification disagrees with "
|
||||
"has_variable_length_inputs — update one or the other.",
|
||||
)
|
||||
|
||||
|
||||
class TestComputeCudaMemLimit(unittest.TestCase):
|
||||
@staticmethod
|
||||
def _fake_mem_get_info(free_value: int, total_value: int):
|
||||
def _impl(free_ptr, total_ptr):
|
||||
free_ptr._obj.value = free_value
|
||||
total_ptr._obj.value = total_value
|
||||
return 0 # cudaSuccess
|
||||
|
||||
return _impl
|
||||
|
||||
@patch("frigate.util.model.ctypes.CDLL")
|
||||
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||
def test_respects_ceiling(self, _mock_getsize, mock_cdll):
|
||||
from frigate.util.model import compute_cuda_mem_limit
|
||||
|
||||
total_vram = 24 * 1024**3
|
||||
mock_lib = MagicMock()
|
||||
mock_cdll.return_value = mock_lib
|
||||
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
|
||||
total_vram, total_vram
|
||||
)
|
||||
|
||||
limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
|
||||
self.assertLessEqual(limit, int(total_vram * 0.80))
|
||||
|
||||
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
|
||||
def test_returns_none_when_cuda_unavailable(self, _mock_cdll):
|
||||
# See compute_cuda_mem_limit docstring for the tradeoff: returning a
|
||||
# hardcoded fallback was wrong for low-VRAM devices (Jetson Nano, K620).
|
||||
from frigate.util.model import compute_cuda_mem_limit
|
||||
|
||||
self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx"))
|
||||
|
||||
@patch("frigate.util.model.ctypes.CDLL")
|
||||
@patch("os.path.getsize", return_value=50 * 1024 * 1024)
|
||||
def test_floor_is_at_least_2gb(self, _mock_getsize, mock_cdll):
|
||||
from frigate.util.model import compute_cuda_mem_limit
|
||||
|
||||
total_vram = 24 * 1024**3
|
||||
mock_lib = MagicMock()
|
||||
mock_cdll.return_value = mock_lib
|
||||
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
|
||||
total_vram, total_vram
|
||||
)
|
||||
|
||||
limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
|
||||
self.assertGreaterEqual(limit, 2 * 1024**3)
|
||||
|
||||
@patch("frigate.util.model.ctypes.CDLL")
|
||||
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||
def test_returns_none_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll):
|
||||
# Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0,
|
||||
# producing gpu_mem_limit=0 and immediate session OOM. We now return
|
||||
# None so the caller omits gpu_mem_limit and ORT manages the arena.
|
||||
from frigate.util.model import compute_cuda_mem_limit
|
||||
|
||||
mock_lib = MagicMock()
|
||||
mock_cdll.return_value = mock_lib
|
||||
mock_lib.cudaMemGetInfo.return_value = 2 # cudaErrorMemoryAllocation
|
||||
|
||||
self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False))
|
||||
|
||||
@patch("frigate.util.model.ctypes.CDLL")
|
||||
@patch("os.path.getsize", return_value=500 * 1024 * 1024)
|
||||
def test_cuda_graph_doubles_peak_multiplier(self, _mock_getsize, mock_cdll):
|
||||
# cuda_graph=True must use peak_multiplier=14 (vs 7 for cuda_graph=False)
|
||||
# because graph capture pins all intermediate tensors live simultaneously.
|
||||
from frigate.util.model import compute_cuda_mem_limit
|
||||
|
||||
total_vram = 24 * 1024**3
|
||||
mock_lib = MagicMock()
|
||||
mock_cdll.return_value = mock_lib
|
||||
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
|
||||
total_vram, total_vram
|
||||
)
|
||||
|
||||
model_size = 500 * 1024 * 1024
|
||||
with_graph = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=True)
|
||||
without_graph = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
|
||||
self.assertGreaterEqual(with_graph, model_size * 14)
|
||||
self.assertGreaterEqual(without_graph, model_size * 7)
|
||||
self.assertGreater(with_graph, without_graph)
|
||||
|
||||
@patch("frigate.util.model.ctypes.CDLL")
|
||||
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||
def test_capped_by_free_vram_when_constrained(self, _mock_getsize, mock_cdll):
|
||||
# Bug #2: with 3 GB free of 24 GB, the limit must respect free × 0.9,
|
||||
# not 80% of total — co-resident embedding sessions would OOM otherwise.
|
||||
from frigate.util.model import compute_cuda_mem_limit
|
||||
|
||||
mock_lib = MagicMock()
|
||||
mock_cdll.return_value = mock_lib
|
||||
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
|
||||
3 * 1024**3, 24 * 1024**3
|
||||
)
|
||||
|
||||
limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
|
||||
self.assertLessEqual(limit, int(3 * 1024**3 * 0.90))
|
||||
|
||||
|
||||
class TestOrtLeakFixRegression(unittest.TestCase):
|
||||
"""Regression guards for the embeddings_manager ORT memory leak fix.
|
||||
|
||||
These tests verify that the three leak vectors identified in GitHub Discussion
|
||||
#23007 remain fixed:
|
||||
|
||||
1. ORT CPU BFC arena (enable_cpu_mem_arena) — must be False for all sessions
|
||||
so host-side GPU↔CPU staging buffers are not pooled indefinitely.
|
||||
|
||||
2. ORT memory-pattern cache (enable_mem_pattern) — must be False for
|
||||
variable-length embedding models (Jina, PaddleOCR) to prevent one
|
||||
mmap-backed plan per unique sequence length from accumulating forever.
|
||||
Must remain True for fixed-size models (YOLO) to preserve buffer aliasing.
|
||||
|
||||
3. mallopt(M_ARENA_MAX) — must be called from inside EmbeddingProcess.run()
|
||||
because glibc reads MALLOC_ARENA_MAX once at malloc init, and the env
|
||||
var is brittle to deliver through s6-overlay supervision before that
|
||||
point. In-process mallopt is the runtime-safe equivalent.
|
||||
"""
|
||||
|
||||
def test_get_optimized_runner_passes_variable_length_for_jina(self):
|
||||
"""get_optimized_runner must enable variable_length_inputs for Jina models."""
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
with patch(
|
||||
"frigate.detectors.detection_runners.get_ort_session_options",
|
||||
wraps=get_ort_session_options,
|
||||
) as mock_opts, patch(
|
||||
"frigate.detectors.detection_runners.ort.InferenceSession"
|
||||
), patch(
|
||||
"frigate.detectors.detection_runners.get_ort_providers",
|
||||
return_value=(["CPUExecutionProvider"], [{}]),
|
||||
), patch(
|
||||
"frigate.detectors.detection_runners.is_rknn_compatible",
|
||||
return_value=False,
|
||||
), patch(
|
||||
"os.path.getsize", return_value=100 * 1024 * 1024
|
||||
):
|
||||
from frigate.detectors.detection_runners import get_optimized_runner
|
||||
|
||||
get_optimized_runner(
|
||||
"/fake/jina.onnx",
|
||||
device="CPU",
|
||||
model_type=EnrichmentModelTypeEnum.jina_v2.value,
|
||||
)
|
||||
|
||||
calls = mock_opts.call_args_list
|
||||
self.assertTrue(
|
||||
any(c.kwargs.get("variable_length_inputs") for c in calls),
|
||||
"get_ort_session_options must be called with variable_length_inputs=True "
|
||||
"for Jina models to prevent mmap plan cache growth",
|
||||
)
|
||||
|
||||
def test_get_optimized_runner_does_not_set_variable_length_for_yolo(self):
|
||||
"""get_optimized_runner must NOT set variable_length_inputs for YOLO.
|
||||
|
||||
Disabling enable_mem_pattern on YOLO (fixed 640×640 input) prevents ORT
|
||||
from aliasing buffers between nodes, pushing peak GPU memory from ~1.8 GB
|
||||
to >4 GB and crashing CUDA graph capture.
|
||||
"""
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
from frigate.detectors.detector_config import ModelTypeEnum
|
||||
|
||||
with patch(
|
||||
"frigate.detectors.detection_runners.get_ort_session_options",
|
||||
wraps=get_ort_session_options,
|
||||
) as mock_opts, patch(
|
||||
"frigate.detectors.detection_runners.ort.InferenceSession"
|
||||
) as mock_session, patch(
|
||||
"frigate.detectors.detection_runners.get_ort_providers",
|
||||
return_value=(["CPUExecutionProvider"], [{}]),
|
||||
), patch(
|
||||
"frigate.detectors.detection_runners.is_rknn_compatible",
|
||||
return_value=False,
|
||||
), patch(
|
||||
"os.path.getsize", return_value=220 * 1024 * 1024
|
||||
):
|
||||
mock_session.return_value.get_inputs.return_value = []
|
||||
mock_session.return_value.get_outputs.return_value = []
|
||||
from frigate.detectors.detection_runners import get_optimized_runner
|
||||
|
||||
get_optimized_runner(
|
||||
"/fake/yolov9.onnx",
|
||||
device="CPU",
|
||||
model_type=ModelTypeEnum.yologeneric.value,
|
||||
)
|
||||
|
||||
for call in mock_opts.call_args_list:
|
||||
self.assertFalse(
|
||||
call.kwargs.get("variable_length_inputs", False),
|
||||
"variable_length_inputs must not be True for YOLO — disabling "
|
||||
"enable_mem_pattern on fixed-size models causes CUDA graph crashes",
|
||||
)
|
||||
|
||||
def test_all_sessions_disable_cpu_mem_arena(self):
|
||||
"""enable_cpu_mem_arena must be False regardless of model type.
|
||||
|
||||
With the arena enabled, ORT pools CPU-side staging buffers for GPU↔CPU
|
||||
transfers indefinitely, causing RSS growth of hundreds of MB per hour.
|
||||
"""
|
||||
from frigate.detectors.detection_runners import get_ort_session_options
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
for model_type in [
|
||||
None,
|
||||
EnrichmentModelTypeEnum.jina_v1.value,
|
||||
EnrichmentModelTypeEnum.jina_v2.value,
|
||||
EnrichmentModelTypeEnum.paddleocr.value,
|
||||
]:
|
||||
with self.subTest(model_type=model_type):
|
||||
from frigate.detectors.detection_runners import ONNXModelRunner
|
||||
|
||||
opts = get_ort_session_options(
|
||||
variable_length_inputs=ONNXModelRunner.has_variable_length_inputs(
|
||||
model_type
|
||||
)
|
||||
)
|
||||
self.assertFalse(
|
||||
opts.enable_cpu_mem_arena,
|
||||
f"enable_cpu_mem_arena must be False for model_type={model_type}",
|
||||
)
|
||||
|
||||
def test_embedding_process_calls_mallopt(self):
|
||||
"""EmbeddingProcess.run() must call mallopt(M_ARENA_MAX) to cap glibc arenas.
|
||||
|
||||
glibc reads MALLOC_ARENA_MAX only at malloc init, before this Python
|
||||
interpreter is up, and the env var is brittle to deliver through the
|
||||
s6-overlay service-supervision chain before that point. mallopt()
|
||||
is the runtime-safe equivalent and must be called explicitly from run().
|
||||
"""
|
||||
import frigate.embeddings as emb_module
|
||||
|
||||
# Make EmbeddingMaintainer raise immediately so run() exits after mallopt.
|
||||
with patch.object(
|
||||
emb_module, "EmbeddingMaintainer", side_effect=RuntimeError("stop")
|
||||
), patch.object(
|
||||
emb_module.EmbeddingProcess, "pre_run_setup"
|
||||
), patch(
|
||||
"ctypes.CDLL"
|
||||
) as mock_cdll:
|
||||
mock_libc = MagicMock()
|
||||
mock_cdll.return_value = mock_libc
|
||||
|
||||
process = emb_module.EmbeddingProcess.__new__(
|
||||
emb_module.EmbeddingProcess
|
||||
)
|
||||
process.config = MagicMock()
|
||||
process.metrics = MagicMock()
|
||||
process.stop_event = MagicMock(is_set=MagicMock(return_value=True))
|
||||
|
||||
try:
|
||||
process.run()
|
||||
except RuntimeError:
|
||||
pass
|
||||
|
||||
mock_cdll.assert_called_with("libc.so.6")
|
||||
mock_libc.mallopt.assert_called_once()
|
||||
args = mock_libc.mallopt.call_args[0]
|
||||
self.assertEqual(
|
||||
args[0],
|
||||
-8, # M_ARENA_MAX
|
||||
"mallopt must be called with M_ARENA_MAX (-8)",
|
||||
)
|
||||
|
||||
|
||||
class TestRunnerOmitsGpuMemLimitOnCudaQueryFailure(unittest.TestCase):
|
||||
"""When compute_cuda_mem_limit returns None, get_optimized_runner must NOT
|
||||
inject gpu_mem_limit at all, leaving ORT's grow-as-needed default in place."""
|
||||
|
||||
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
|
||||
@patch(
|
||||
"frigate.detectors.detection_runners.get_ort_providers",
|
||||
return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
|
||||
)
|
||||
@patch(
|
||||
"frigate.detectors.detection_runners.is_rknn_compatible",
|
||||
return_value=False,
|
||||
)
|
||||
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
|
||||
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||
def test_no_gpu_mem_limit_key_when_cuda_query_fails(
|
||||
self, _gs, _cdll, _rknn, _gp, mock_session
|
||||
):
|
||||
from frigate.detectors.detection_runners import get_optimized_runner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
mock_session.return_value.get_inputs.return_value = []
|
||||
mock_session.return_value.get_outputs.return_value = []
|
||||
|
||||
get_optimized_runner(
|
||||
"/fake/jina.onnx",
|
||||
device="GPU",
|
||||
model_type=EnrichmentModelTypeEnum.jina_v2.value,
|
||||
)
|
||||
|
||||
provider_opts = mock_session.call_args.kwargs["provider_options"]
|
||||
self.assertNotIn(
|
||||
"gpu_mem_limit",
|
||||
provider_opts[0],
|
||||
"Must omit (not set to 0, not set to a guess) when query fails",
|
||||
)
|
||||
|
||||
|
||||
class TestRunnerInjectsGpuMemLimitOnCudaQuerySuccess(unittest.TestCase):
|
||||
"""Positive counterpart to TestRunnerOmitsGpuMemLimitOnCudaQueryFailure:
|
||||
when cudaMemGetInfo succeeds, gpu_mem_limit must be injected into
|
||||
provider_options so ORT's BFC arena is bounded."""
|
||||
|
||||
@staticmethod
|
||||
def _fake_mem_get_info(free_value: int, total_value: int):
|
||||
def _impl(free_ptr, total_ptr):
|
||||
free_ptr._obj.value = free_value
|
||||
total_ptr._obj.value = total_value
|
||||
return 0 # cudaSuccess
|
||||
|
||||
return _impl
|
||||
|
||||
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
|
||||
@patch(
|
||||
"frigate.detectors.detection_runners.get_ort_providers",
|
||||
return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
|
||||
)
|
||||
@patch(
|
||||
"frigate.detectors.detection_runners.is_rknn_compatible",
|
||||
return_value=False,
|
||||
)
|
||||
@patch("frigate.util.model.ctypes.CDLL")
|
||||
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||
def test_gpu_mem_limit_key_present_when_cuda_query_succeeds(
|
||||
self, _gs, mock_cdll, _rknn, _gp, mock_session
|
||||
):
|
||||
from frigate.detectors.detection_runners import get_optimized_runner
|
||||
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||
|
||||
total_vram = 24 * 1024**3
|
||||
mock_lib = MagicMock()
|
||||
mock_cdll.return_value = mock_lib
|
||||
mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
|
||||
total_vram, total_vram
|
||||
)
|
||||
mock_session.return_value.get_inputs.return_value = []
|
||||
mock_session.return_value.get_outputs.return_value = []
|
||||
|
||||
get_optimized_runner(
|
||||
"/fake/jina.onnx",
|
||||
device="GPU",
|
||||
model_type=EnrichmentModelTypeEnum.jina_v2.value,
|
||||
)
|
||||
|
||||
provider_opts = mock_session.call_args.kwargs["provider_options"]
|
||||
self.assertIn("gpu_mem_limit", provider_opts[0])
|
||||
self.assertGreater(provider_opts[0]["gpu_mem_limit"], 0)
|
||||
|
||||
|
||||
class TestCudaGraphFallbackLogsException(unittest.TestCase):
    """Regression tests for the CUDA-graph -> plain ONNX runner fallback warning.

    Both tests force the first InferenceSession construction to raise (simulating
    a failed CUDA graph capture) and let the second one succeed, then inspect the
    WARNING log emitted by get_optimized_runner.
    """

    @patch("frigate.detectors.detection_runners.ort.InferenceSession")
    @patch(
        "frigate.detectors.detection_runners.get_ort_providers",
        return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
    )
    @patch(
        "frigate.detectors.detection_runners.is_rknn_compatible",
        return_value=False,
    )
    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
    def test_fallback_warning_includes_exception_text(
        self, _getsize, _cdll, _rknn_check, _providers, session_cls
    ):
        # Concern #1: the bare `except Exception:` swallowed the underlying
        # ORT error (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.),
        # turning a debuggable failure into an opaque "fell back to ONNX runner".
        from frigate.detectors.detection_runners import get_optimized_runner
        from frigate.detectors.detector_config import ModelTypeEnum

        # First construction attempt (CUDA graph) blows up; retry succeeds.
        session_cls.side_effect = [
            RuntimeError("cudaErrorStreamCaptureUnsupported"),
            MagicMock(get_inputs=lambda: [], get_outputs=lambda: []),
        ]

        with self.assertLogs(
            "frigate.detectors.detection_runners", level="WARNING"
        ) as logs:
            get_optimized_runner(
                "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value
            )

        warning_text = "\n".join(logs.output)
        for expected in ("CUDA graph capture failed", "cudaErrorStreamCaptureUnsupported"):
            self.assertIn(expected, warning_text)

    @patch("frigate.detectors.detection_runners.ort.InferenceSession")
    @patch(
        "frigate.detectors.detection_runners.get_ort_providers",
        return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
    )
    @patch(
        "frigate.detectors.detection_runners.is_rknn_compatible",
        return_value=False,
    )
    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
    def test_fallback_warning_includes_developer_context(
        self, _getsize, _cdll, _rknn_check, _providers, session_cls
    ):
        # Guards the enriched warning fields (model_type, device_id, providers)
        # so a future revert to the bare "model_path + e" form is caught.
        from frigate.detectors.detection_runners import get_optimized_runner
        from frigate.detectors.detector_config import ModelTypeEnum

        session_cls.side_effect = [
            RuntimeError("boom"),
            MagicMock(get_inputs=lambda: [], get_outputs=lambda: []),
        ]

        with self.assertLogs(
            "frigate.detectors.detection_runners", level="WARNING"
        ) as logs:
            get_optimized_runner(
                "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value
            )

        warning_text = "\n".join(logs.output)
        self.assertIn(f"model_type={ModelTypeEnum.yologeneric.value}", warning_text)
        self.assertIn("path=/m/yolo.onnx", warning_text)
        self.assertIn("device_id=0", warning_text)
        self.assertIn("CUDAExecutionProvider", warning_text)
|
||||
|
||||
|
||||
# Allow running this test module directly (e.g. `python test_file.py`)
# in addition to the normal pytest/unittest discovery.
if __name__ == "__main__":
    unittest.main()
|
||||
@ -1,5 +1,6 @@
|
||||
"""Model Utils"""
|
||||
|
||||
import ctypes
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
@ -283,6 +284,56 @@ def post_process_yolox(
|
||||
### ONNX Utilities
|
||||
|
||||
|
||||
def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int | None:
    """Derive a per-session gpu_mem_limit for the ORT CUDA EP BFC arena.

    Intended only for embedding ONNXModelRunner sessions. CudaGraphRunner
    (YOLO detection) must NOT use this: graph capture keeps every intermediate
    tensor live at once, so its peak GPU usage (15-20x the model file size)
    cannot be safely capped.

    The limit is the smallest of:
      - model file size x a peak-usage multiplier (floored at 2 GiB),
      - 80% of total VRAM,
      - 90% of currently free VRAM (so co-resident embedding sessions —
        jina text + vision, paddleocr det + rec, arcface — don't OOM each
        other on shared GPUs).

    Returns None when the CUDA runtime query fails; the caller MUST then omit
    gpu_mem_limit entirely so ORT falls back to its own grow-as-needed default.
    A hardcoded fallback (e.g. 4 GB) was rejected: it breaks session init on
    small/shared-memory devices (Jetson Nano, 2 GB Quadro K620 / GT 1030,
    broken /dev/nvidia* passthrough) and needlessly starves sessions on large
    cards (24 GB RTX 3090). The leak vectors this addresses (mem_pattern,
    mallopt) are independent of the arena cap, so dropping the cap on the
    failure path does not reintroduce the leak.
    """
    try:
        runtime = ctypes.CDLL("libcudart.so")
        free_ct, total_ct = ctypes.c_size_t(), ctypes.c_size_t()
        rc = runtime.cudaMemGetInfo(ctypes.byref(free_ct), ctypes.byref(total_ct))
        # A zero total also means the query didn't really work (e.g. stub lib).
        if rc != 0 or total_ct.value == 0:
            raise RuntimeError(f"cudaMemGetInfo rc={rc} total={total_ct.value}")
    except Exception as e:
        # Deliberate best-effort: no usable CUDA runtime -> let ORT self-manage.
        logger.debug("cudaMemGetInfo unavailable (%s); omitting gpu_mem_limit", e)
        return None

    total, free = total_ct.value, free_ct.value
    multiplier = 14 if cuda_graph else 7
    wanted = max(os.path.getsize(model_path) * multiplier, 2 * (1 << 30))
    return min(wanted, int(total * 0.80), int(free * 0.90))
|
||||
|
||||
|
||||
def get_ort_providers(
|
||||
force_cpu: bool = False,
|
||||
device: str | None = "AUTO",
|
||||
|
||||
Loading…
Reference in New Issue
Block a user