mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-05-09 15:05:26 +03:00
fix: prevent embeddings_manager ORT memory leak (arena + mmap plan + glibc)
Three independent ORT/glibc leak vectors identified and fixed:

1. **ORT CPU BFC arena** (`enable_cpu_mem_arena=False` for all sessions)
   ORT's default CPU arena pools host-side GPU↔CPU staging buffers indefinitely. Disabling it across every InferenceSession (detection + embedding) stops hundreds-of-MB/h RSS growth seen on systems with CUDA EP sessions.

2. **ORT memory-pattern cache** (`enable_mem_pattern=False` for variable-length models)
   For embedding models with variable-length inputs (Jina v1/v2, PaddleOCR), ORT allocates one mmap-backed execution plan per unique sequence length and never frees them. Disabling the pattern cache stops this unbounded anon-mmap growth. Fixed-size models (YOLO) keep `enable_mem_pattern=True` to preserve buffer aliasing and avoid CUDA graph capture failures.

3. **mallopt(M_ARENA_MAX)** called from `EmbeddingProcess.run()`
   The forkserver start method exec()s a fresh Python interpreter that does not inherit Docker env vars, so `MALLOC_ARENA_MAX` set in docker-compose never reaches the child. Calling `mallopt(-8, os.cpu_count())` from `run()` caps glibc malloc arenas in the child process.

Additional improvements:

- `compute_cuda_mem_limit()`: dynamically caps the ORT CUDA EP BFC arena for embedding sessions to min(max(model_size × 7, 2 GB), 80% VRAM); prevents OOM on multi-model systems while leaving headroom for detection sessions.
- CUDA graph capture is now wrapped in try/except so models with CPU-only ops (e.g. attention, NMS) fall back to ONNXModelRunner instead of crashing.
- `ONNXModelRunner.has_variable_length_inputs()`: centralises the Jina/PaddleOCR detection logic to keep SessionOptions creation consistent.
- 17 regression-guard unit tests in `frigate/test/test_detection_runners.py` that will fail if any of these three fixes is accidentally reverted.

Fixes: #23007

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
45213d0420
commit
71060805f0
@ -10,7 +10,7 @@ from typing import Any
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import onnxruntime as ort
|
import onnxruntime as ort
|
||||||
|
|
||||||
from frigate.util.model import get_ort_providers
|
from frigate.util.model import compute_cuda_mem_limit, get_ort_providers
|
||||||
from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible
|
from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -24,24 +24,37 @@ def is_arm64_platform() -> bool:
|
|||||||
|
|
||||||
def get_ort_session_options(
|
def get_ort_session_options(
|
||||||
is_complex_model: bool = False,
|
is_complex_model: bool = False,
|
||||||
) -> ort.SessionOptions | None:
|
variable_length_inputs: bool = False,
|
||||||
|
) -> ort.SessionOptions:
|
||||||
"""Get ONNX Runtime session options with appropriate settings.
|
"""Get ONNX Runtime session options with appropriate settings.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
is_complex_model: Whether the model needs basic optimization to avoid graph fusion issues.
|
is_complex_model: Whether the model needs basic optimization to avoid graph fusion issues.
|
||||||
|
variable_length_inputs: Whether the model receives variable-length inputs (e.g. text
|
||||||
|
embeddings). When True, disables memory-pattern caching, which otherwise builds
|
||||||
|
a plan per unique input shape and holds onto mmap regions indefinitely — a major
|
||||||
|
source of RSS growth in the embeddings_manager process.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
SessionOptions with appropriate optimization level, or None for default settings.
|
SessionOptions with appropriate settings.
|
||||||
"""
|
"""
|
||||||
if is_complex_model:
|
|
||||||
sess_options = ort.SessionOptions()
|
sess_options = ort.SessionOptions()
|
||||||
|
# Disable the CPU BFC arena for all sessions. With the arena enabled ORT pools
|
||||||
|
# host-side staging buffers for GPU↔CPU transfers and never releases them back to
|
||||||
|
# the OS, causing RSS to grow without bound in long-running embedding processes.
|
||||||
|
sess_options.enable_cpu_mem_arena = False
|
||||||
|
if variable_length_inputs:
|
||||||
|
# Disable per-shape memory-layout plan caching for models with variable-length
|
||||||
|
# inputs (Jina CLIP text, PaddleOCR). Each unique sequence length creates a
|
||||||
|
# new mmap-backed plan that is never freed, leading to unbounded anon-mmap growth.
|
||||||
|
# Fixed-size models (YOLO at 640×640) should keep this enabled for buffer aliasing.
|
||||||
|
sess_options.enable_mem_pattern = False
|
||||||
|
if is_complex_model:
|
||||||
sess_options.graph_optimization_level = (
|
sess_options.graph_optimization_level = (
|
||||||
ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
|
ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
|
||||||
)
|
)
|
||||||
return sess_options
|
return sess_options
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
# Import OpenVINO only when needed to avoid circular dependencies
|
# Import OpenVINO only when needed to avoid circular dependencies
|
||||||
try:
|
try:
|
||||||
@ -137,6 +150,25 @@ class ONNXModelRunner(BaseModelRunner):
|
|||||||
ModelTypeEnum.dfine.value,
|
ModelTypeEnum.dfine.value,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def has_variable_length_inputs(model_type: str | None) -> bool:
|
||||||
|
"""Return True for models whose input length varies between inferences.
|
||||||
|
|
||||||
|
ORT builds a memory-layout plan per unique input shape and caches it
|
||||||
|
indefinitely (enable_mem_pattern). For fixed-size models (YOLO) this
|
||||||
|
is a single plan; for variable-length text embeddings it grows without
|
||||||
|
bound and must be disabled.
|
||||||
|
"""
|
||||||
|
if not model_type:
|
||||||
|
return False
|
||||||
|
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||||
|
|
||||||
|
return model_type in [
|
||||||
|
EnrichmentModelTypeEnum.jina_v1.value,
|
||||||
|
EnrichmentModelTypeEnum.jina_v2.value,
|
||||||
|
EnrichmentModelTypeEnum.paddleocr.value,
|
||||||
|
]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_concurrent_model(model_type: str | None) -> bool:
|
def is_concurrent_model(model_type: str | None) -> bool:
|
||||||
"""Check if model requires thread locking for concurrent inference.
|
"""Check if model requires thread locking for concurrent inference.
|
||||||
@ -582,17 +614,21 @@ def get_optimized_runner(
|
|||||||
CudaGraphRunner.is_model_supported(model_type)
|
CudaGraphRunner.is_model_supported(model_type)
|
||||||
and providers[0] == "CUDAExecutionProvider"
|
and providers[0] == "CUDAExecutionProvider"
|
||||||
):
|
):
|
||||||
options[0] = {
|
try:
|
||||||
**options[0],
|
cuda_graph_options = {**options[0], "enable_cuda_graph": True}
|
||||||
"enable_cuda_graph": True,
|
|
||||||
}
|
|
||||||
return CudaGraphRunner(
|
return CudaGraphRunner(
|
||||||
ort.InferenceSession(
|
ort.InferenceSession(
|
||||||
model_path,
|
model_path,
|
||||||
|
sess_options=get_ort_session_options(),
|
||||||
providers=providers,
|
providers=providers,
|
||||||
provider_options=options,
|
provider_options=[cuda_graph_options, *options[1:]],
|
||||||
),
|
),
|
||||||
options[0]["device_id"],
|
cuda_graph_options["device_id"],
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
logger.warning(
|
||||||
|
"CUDA graph capture failed for %s, falling back to standard ONNX runner",
|
||||||
|
model_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@ -604,11 +640,20 @@ def get_optimized_runner(
|
|||||||
providers.pop(0)
|
providers.pop(0)
|
||||||
options.pop(0)
|
options.pop(0)
|
||||||
|
|
||||||
|
if providers and providers[0] == "CUDAExecutionProvider":
|
||||||
|
options[0] = {
|
||||||
|
**options[0],
|
||||||
|
"gpu_mem_limit": compute_cuda_mem_limit(model_path, cuda_graph=False),
|
||||||
|
}
|
||||||
|
|
||||||
return ONNXModelRunner(
|
return ONNXModelRunner(
|
||||||
ort.InferenceSession(
|
ort.InferenceSession(
|
||||||
model_path,
|
model_path,
|
||||||
sess_options=get_ort_session_options(
|
sess_options=get_ort_session_options(
|
||||||
ONNXModelRunner.is_cpu_complex_model(model_type)
|
is_complex_model=ONNXModelRunner.is_cpu_complex_model(model_type),
|
||||||
|
variable_length_inputs=ONNXModelRunner.has_variable_length_inputs(
|
||||||
|
model_type
|
||||||
|
),
|
||||||
),
|
),
|
||||||
providers=providers,
|
providers=providers,
|
||||||
provider_options=options,
|
provider_options=options,
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
"""SQLite-vec embeddings database."""
|
"""SQLite-vec embeddings database."""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
import ctypes
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@ -46,6 +47,16 @@ class EmbeddingProcess(FrigateProcess):
|
|||||||
self.metrics = metrics
|
self.metrics = metrics
|
||||||
|
|
||||||
def run(self) -> None:
|
def run(self) -> None:
|
||||||
|
# Forkserver spawn exec's a fresh Python interpreter that does not
|
||||||
|
# inherit Docker env vars, so MALLOC_ARENA_MAX set in docker-compose
|
||||||
|
# never reaches this process. Set it here via mallopt so glibc caps
|
||||||
|
# the number of malloc arenas to N_CPU instead of the default 8×N_CPU,
|
||||||
|
# preventing heap fragmentation under the embeddings workload.
|
||||||
|
try:
|
||||||
|
ctypes.CDLL("libc.so.6").mallopt(-8, os.cpu_count()) # M_ARENA_MAX
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
self.pre_run_setup(self.config.logger)
|
self.pre_run_setup(self.config.logger)
|
||||||
maintainer = EmbeddingMaintainer(
|
maintainer = EmbeddingMaintainer(
|
||||||
self.config,
|
self.config,
|
||||||
|
|||||||
315
frigate/test/test_detection_runners.py
Normal file
315
frigate/test/test_detection_runners.py
Normal file
@ -0,0 +1,315 @@
|
|||||||
|
"""Tests for detection_runners session options and memory management helpers."""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetOrtSessionOptions(unittest.TestCase):
    """Unit tests for the SessionOptions built by get_ort_session_options()."""

    def setUp(self):
        # onnxruntime is imported per test instead of at module import time.
        import onnxruntime as ort

        self.ort = ort

    @staticmethod
    def _make_options(**flags):
        """Import the factory lazily and build options with the given flags."""
        from frigate.detectors.detection_runners import get_ort_session_options

        return get_ort_session_options(**flags)

    def test_default_disables_cpu_mem_arena(self):
        """With no flags, the CPU memory arena is switched off."""
        default_opts = self._make_options()
        self.assertFalse(default_opts.enable_cpu_mem_arena)

    def test_default_keeps_mem_pattern_enabled(self):
        """With no flags, memory-pattern caching stays on."""
        default_opts = self._make_options()
        self.assertTrue(default_opts.enable_mem_pattern)

    def test_variable_length_inputs_disables_mem_pattern(self):
        """variable_length_inputs=True turns off both mem-pattern and the arena."""
        variable_opts = self._make_options(variable_length_inputs=True)
        self.assertFalse(variable_opts.enable_mem_pattern)
        self.assertFalse(variable_opts.enable_cpu_mem_arena)

    def test_complex_model_sets_basic_optimization(self):
        """is_complex_model=True selects the basic graph optimization level."""
        complex_opts = self._make_options(is_complex_model=True)
        expected_level = self.ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
        self.assertEqual(complex_opts.graph_optimization_level, expected_level)

    def test_always_returns_session_options(self):
        """Every flag combination checked here yields a SessionOptions object."""
        expected_type = self.ort.SessionOptions
        flag_sets = (
            {},
            {"is_complex_model": True},
            {"variable_length_inputs": True},
        )
        for flags in flag_sets:
            self.assertIsInstance(self._make_options(**flags), expected_type)
|
||||||
|
|
||||||
|
|
||||||
|
class TestHasVariableLengthInputs(unittest.TestCase):
    """Unit tests for ONNXModelRunner.has_variable_length_inputs()."""

    @staticmethod
    def _runner_cls():
        """Import ONNXModelRunner lazily so the import happens inside each test."""
        from frigate.detectors.detection_runners import ONNXModelRunner

        return ONNXModelRunner

    def test_jina_v1_is_variable(self):
        """Jina v1 is reported as variable-length."""
        runner_cls = self._runner_cls()
        from frigate.embeddings.types import EnrichmentModelTypeEnum

        verdict = runner_cls.has_variable_length_inputs(
            EnrichmentModelTypeEnum.jina_v1.value
        )
        self.assertTrue(verdict)

    def test_jina_v2_is_variable(self):
        """Jina v2 is reported as variable-length."""
        runner_cls = self._runner_cls()
        from frigate.embeddings.types import EnrichmentModelTypeEnum

        verdict = runner_cls.has_variable_length_inputs(
            EnrichmentModelTypeEnum.jina_v2.value
        )
        self.assertTrue(verdict)

    def test_paddleocr_is_variable(self):
        """PaddleOCR is reported as variable-length."""
        runner_cls = self._runner_cls()
        from frigate.embeddings.types import EnrichmentModelTypeEnum

        verdict = runner_cls.has_variable_length_inputs(
            EnrichmentModelTypeEnum.paddleocr.value
        )
        self.assertTrue(verdict)

    def test_yolo_generic_is_fixed(self):
        """The generic YOLO detector type is reported as fixed-size."""
        runner_cls = self._runner_cls()
        from frigate.detectors.detector_config import ModelTypeEnum

        verdict = runner_cls.has_variable_length_inputs(
            ModelTypeEnum.yologeneric.value
        )
        self.assertFalse(verdict)

    def test_none_is_fixed(self):
        """A missing model type is reported as fixed-size."""
        runner_cls = self._runner_cls()

        self.assertFalse(runner_cls.has_variable_length_inputs(None))
|
||||||
|
|
||||||
|
|
||||||
|
class TestComputeCudaMemLimit(unittest.TestCase):
    """Unit tests for compute_cuda_mem_limit() with a stubbed CUDA runtime."""

    @staticmethod
    def _install_fake_cudart(mock_cdll, total_vram):
        """Make the patched ctypes.CDLL return a libcudart stub.

        The stub's cudaMemGetInfo writes ``total_vram`` into both output
        arguments and returns 0 (cudaSuccess).
        """

        def fake_mem_get_info(free_ptr, total_ptr):
            # ctypes.byref() objects expose the wrapped c_size_t as ``_obj``.
            total_ptr._obj.value = total_vram
            free_ptr._obj.value = total_vram
            return 0

        mock_lib = MagicMock()
        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
        mock_cdll.return_value = mock_lib

    @patch("frigate.util.model.ctypes.CDLL")
    @patch("os.path.getsize", return_value=200 * 1024 * 1024)  # 200 MB model
    def test_respects_ceiling(self, mock_getsize, mock_cdll):
        """gpu_mem_limit must not exceed 80% of total VRAM."""
        from frigate.util.model import compute_cuda_mem_limit

        # Use a small card so the 80% ceiling (~1.6 GB) sits below the 2 GB
        # floor. On a large card the result never gets near the ceiling and
        # this test would pass even with the clamp removed.
        total_vram = 2 * 1024**3  # 2 GB
        self._install_fake_cudart(mock_cdll, total_vram)

        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)

        ceiling = int(total_vram * 0.80)
        self.assertLessEqual(limit, ceiling)
        self.assertEqual(limit, ceiling)

    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
    def test_fallback_on_cuda_unavailable(self, _mock_cdll):
        """Falls back to 4 GB when CUDA runtime is not available."""
        from frigate.util.model import compute_cuda_mem_limit

        limit = compute_cuda_mem_limit("/fake/model.onnx")
        self.assertEqual(limit, 4 * 1024**3)

    @patch("frigate.util.model.ctypes.CDLL")
    @patch("os.path.getsize", return_value=50 * 1024 * 1024)  # 50 MB model
    def test_floor_is_at_least_2gb(self, mock_getsize, mock_cdll):
        """Floor must be at least 2 GB regardless of model size."""
        from frigate.util.model import compute_cuda_mem_limit

        # Plenty of VRAM, so only the floor can decide the result.
        self._install_fake_cudart(mock_cdll, 24 * 1024**3)

        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
        self.assertGreaterEqual(limit, 2 * 1024**3)
|
||||||
|
|
||||||
|
|
||||||
|
class TestOrtLeakFixRegression(unittest.TestCase):
    """Regression guards for the embeddings_manager ORT memory leak fix.

    Each test pins one of the three leak vectors from GitHub Discussion
    #23007 so that reverting a fix is caught:

    1. ORT CPU BFC arena (enable_cpu_mem_arena) must be False for every
       session, so host-side GPU↔CPU staging buffers are not pooled forever.

    2. ORT memory-pattern cache (enable_mem_pattern) must be False for
       variable-length embedding models (Jina, PaddleOCR), which otherwise
       accumulate one mmap-backed plan per unique sequence length, and must
       stay True for fixed-size models (YOLO) to preserve buffer aliasing.

    3. mallopt(M_ARENA_MAX) must be called from inside EmbeddingProcess.run(),
       because forkserver spawn does not inherit Docker env vars and so
       MALLOC_ARENA_MAX set in docker-compose never reaches the child.
    """

    def test_get_optimized_runner_passes_variable_length_for_jina(self):
        """get_optimized_runner must enable variable_length_inputs for Jina models."""
        from frigate.detectors.detection_runners import get_ort_session_options
        from frigate.embeddings.types import EnrichmentModelTypeEnum

        # wraps= keeps the real factory running while recording each call.
        with patch(
            "frigate.detectors.detection_runners.get_ort_session_options",
            wraps=get_ort_session_options,
        ) as options_spy:
            with patch("frigate.detectors.detection_runners.ort.InferenceSession"):
                with patch(
                    "frigate.detectors.detection_runners.get_ort_providers",
                    return_value=(["CPUExecutionProvider"], [{}]),
                ):
                    with patch(
                        "frigate.detectors.detection_runners.is_rknn_compatible",
                        return_value=False,
                    ):
                        with patch(
                            "os.path.getsize", return_value=100 * 1024 * 1024
                        ):
                            from frigate.detectors.detection_runners import (
                                get_optimized_runner,
                            )

                            get_optimized_runner(
                                "/fake/jina.onnx",
                                device="CPU",
                                model_type=EnrichmentModelTypeEnum.jina_v2.value,
                            )

        flag_seen = False
        for recorded in options_spy.call_args_list:
            if recorded.kwargs.get("variable_length_inputs"):
                flag_seen = True
                break

        self.assertTrue(
            flag_seen,
            "get_ort_session_options must be called with variable_length_inputs=True "
            "for Jina models to prevent mmap plan cache growth",
        )

    def test_get_optimized_runner_does_not_set_variable_length_for_yolo(self):
        """get_optimized_runner must NOT set variable_length_inputs for YOLO.

        Turning enable_mem_pattern off for YOLO (fixed 640×640 input) stops ORT
        from aliasing buffers between nodes, which pushes peak GPU memory from
        ~1.8 GB to >4 GB and crashes CUDA graph capture.
        """
        from frigate.detectors.detection_runners import get_ort_session_options
        from frigate.detectors.detector_config import ModelTypeEnum

        with patch(
            "frigate.detectors.detection_runners.get_ort_session_options",
            wraps=get_ort_session_options,
        ) as options_spy:
            with patch(
                "frigate.detectors.detection_runners.ort.InferenceSession"
            ) as session_cls:
                with patch(
                    "frigate.detectors.detection_runners.get_ort_providers",
                    return_value=(["CPUExecutionProvider"], [{}]),
                ):
                    with patch(
                        "frigate.detectors.detection_runners.is_rknn_compatible",
                        return_value=False,
                    ):
                        with patch(
                            "os.path.getsize", return_value=220 * 1024 * 1024
                        ):
                            # The stubbed session reports no inputs or outputs.
                            fake_session = session_cls.return_value
                            fake_session.get_inputs.return_value = []
                            fake_session.get_outputs.return_value = []
                            from frigate.detectors.detection_runners import (
                                get_optimized_runner,
                            )

                            get_optimized_runner(
                                "/fake/yolov9.onnx",
                                device="CPU",
                                model_type=ModelTypeEnum.yologeneric.value,
                            )

        for recorded in options_spy.call_args_list:
            flag_value = recorded.kwargs.get("variable_length_inputs", False)
            self.assertFalse(
                flag_value,
                "variable_length_inputs must not be True for YOLO — disabling "
                "enable_mem_pattern on fixed-size models causes CUDA graph crashes",
            )

    def test_all_sessions_disable_cpu_mem_arena(self):
        """enable_cpu_mem_arena must be False regardless of model type.

        With the arena on, ORT pools CPU-side staging buffers for GPU↔CPU
        transfers indefinitely, causing RSS growth of hundreds of MB per hour.
        """
        from frigate.detectors.detection_runners import get_ort_session_options
        from frigate.embeddings.types import EnrichmentModelTypeEnum

        model_types = [
            None,
            EnrichmentModelTypeEnum.jina_v1.value,
            EnrichmentModelTypeEnum.jina_v2.value,
            EnrichmentModelTypeEnum.paddleocr.value,
        ]
        for model_type in model_types:
            with self.subTest(model_type=model_type):
                from frigate.detectors.detection_runners import ONNXModelRunner

                is_variable = ONNXModelRunner.has_variable_length_inputs(model_type)
                opts = get_ort_session_options(variable_length_inputs=is_variable)
                self.assertFalse(
                    opts.enable_cpu_mem_arena,
                    f"enable_cpu_mem_arena must be False for model_type={model_type}",
                )

    def test_embedding_process_calls_mallopt(self):
        """EmbeddingProcess.run() must call mallopt(M_ARENA_MAX) to cap glibc arenas.

        Forkserver spawn exec's a fresh Python interpreter that does not inherit
        Docker env vars, so MALLOC_ARENA_MAX from docker-compose never reaches
        the child process and mallopt() has to be called explicitly from run().
        """
        import frigate.embeddings as emb_module

        # EmbeddingMaintainer raises at once so run() stops right after mallopt.
        with patch.object(
            emb_module, "EmbeddingMaintainer", side_effect=RuntimeError("stop")
        ):
            with patch.object(emb_module.EmbeddingProcess, "pre_run_setup"):
                with patch("ctypes.CDLL") as cdll_factory:
                    fake_libc = MagicMock()
                    cdll_factory.return_value = fake_libc

                    # Skip __init__; set only the attributes this test supplies.
                    proc = emb_module.EmbeddingProcess.__new__(
                        emb_module.EmbeddingProcess
                    )
                    proc.config = MagicMock()
                    proc.metrics = MagicMock()
                    proc.stop_event = MagicMock(is_set=MagicMock(return_value=True))

                    try:
                        proc.run()
                    except RuntimeError:
                        pass

        cdll_factory.assert_called_with("libc.so.6")
        fake_libc.mallopt.assert_called_once()
        positional = fake_libc.mallopt.call_args[0]
        self.assertEqual(
            positional[0],
            -8,  # M_ARENA_MAX
            "mallopt must be called with M_ARENA_MAX (-8)",
        )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@ -1,5 +1,6 @@
|
|||||||
"""Model Utils"""
|
"""Model Utils"""
|
||||||
|
|
||||||
|
import ctypes
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from typing import Any
|
from typing import Any
|
||||||
@ -283,6 +284,35 @@ def post_process_yolox(
|
|||||||
### ONNX Utilities
|
### ONNX Utilities
|
||||||
|
|
||||||
|
|
||||||
|
def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:
    """Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena.

    For CudaGraphRunner (YOLO detection) do NOT call this — CUDA graph capture
    requires all intermediate tensors to be live simultaneously, so peak GPU
    memory is 15-20× the model file size and cannot be safely capped. This
    function is intended for embedding ONNXModelRunner sessions only.

    Args:
        model_path: Path to the model file; its on-disk size drives the floor.
        cuda_graph: Selects the peak multiplier applied to the file size
            (14 when True, 7 when False).

    Returns:
        Limit in bytes, derived from:
          - Floor: model file size × peak multiplier (at least 2 GB)
          - Ceiling: 80% of total GPU VRAM
        The smaller of the two is returned. Falls back to 4 GB if the CUDA
        runtime cannot be loaded or queried, or if it reports an error or
        no memory.

    Raises:
        OSError: If the model file size cannot be read once the CUDA query
            has succeeded.
    """
    fallback_limit = 4 * 1024**3

    try:
        libcudart = ctypes.CDLL("libcudart.so")
        free_bytes = ctypes.c_size_t()
        total_bytes = ctypes.c_size_t()
        status = libcudart.cudaMemGetInfo(
            ctypes.byref(free_bytes), ctypes.byref(total_bytes)
        )
        total = total_bytes.value
    except Exception:
        logger.debug("cudaMemGetInfo unavailable; using 4 GB gpu_mem_limit fallback")
        return fallback_limit

    # cudaMemGetInfo returns 0 (cudaSuccess) on success. On failure the output
    # arguments are left at 0, which would give a ceiling, and so a limit, of
    # 0 bytes. Treat a non-zero status or an empty total like "CUDA unavailable".
    if status or total <= 0:
        logger.debug(
            "cudaMemGetInfo failed (status=%s, total=%s); "
            "using 4 GB gpu_mem_limit fallback",
            status,
            total,
        )
        return fallback_limit

    peak_multiplier = 14 if cuda_graph else 7
    floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
    ceiling = int(total * 0.80)
    return min(floor, ceiling)
|
||||||
|
|
||||||
|
|
||||||
def get_ort_providers(
|
def get_ort_providers(
|
||||||
force_cpu: bool = False,
|
force_cpu: bool = False,
|
||||||
device: str | None = "AUTO",
|
device: str | None = "AUTO",
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user