fix: omit gpu_mem_limit on CUDA query failure instead of guessing 4 GB

When cudaMemGetInfo cannot be called or returns an error, compute_cuda_mem_limit
now returns None and the caller skips injecting gpu_mem_limit, leaving ORT to
manage its own arena (grow-as-needed up to device capacity).
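
For context, a minimal sketch of the query and its two failure modes, assuming the
ctypes/libcudart approach used by compute_cuda_mem_limit in the diff below (the
helper name query_vram is illustrative, not part of this change):

    import ctypes

    def query_vram():
        # Sketch only: mirrors the ctypes pattern in compute_cuda_mem_limit.
        try:
            # Failure mode 1: no CUDA runtime library on the host / in the container.
            libcudart = ctypes.CDLL("libcudart.so")
        except OSError:
            return None
        free_b, total_b = ctypes.c_size_t(), ctypes.c_size_t()
        # cudaMemGetInfo(size_t* free, size_t* total) returns a cudaError_t.
        rc = libcudart.cudaMemGetInfo(ctypes.byref(free_b), ctypes.byref(total_b))
        if rc != 0:
            # Failure mode 2: the call exists but errors, e.g. broken device passthrough.
            return None
        return free_b.value, total_b.value  # (free, total) in bytes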

Tradeoff documented in the docstring:
- Old behavior returned a hardcoded 4 GB. That was wrong for low-VRAM devices
  (Jetson Nano 4 GB shared, Quadro K620 2 GB, GT 1030 2 GB) and broken
  /dev/nvidia* container passthroughs, where requesting 4 GB caused
  cudaErrorMemoryAllocation at session init. It was also wrong for big GPUs
  (24 GB RTX 3090 with 20 GB free), needlessly starving the session.
- The leak vectors fixed elsewhere in this PR (mem_pattern + mallopt) are
  independent of the BFC arena cap, so dropping the cap on the failure path
  does not reintroduce the leak.
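
Concretely, the only difference on the failure path is that provider_options carries
no gpu_mem_limit key at all (a sketch; the device_id, model path, and 2 GB figure are
illustrative, not taken from this change):

    import onnxruntime as ort

    model_path = "/path/to/model.onnx"  # placeholder

    # Measured path: cap the CUDA EP's BFC arena explicitly.
    provider_options = [{"device_id": 0, "gpu_mem_limit": 2 * 1024**3}]

    # Failure path: omit the key entirely. ORT then applies its default arena
    # config (grow-as-needed up to device capacity) instead of 0 or a guess.
    provider_options = [{"device_id": 0}]

    session = ort.InferenceSession(
        model_path,
        providers=["CUDAExecutionProvider"],
        provider_options=provider_options,
    )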

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
felalex 2026-05-02 23:58:06 -07:00
parent 6a16fa667b
commit 351bef936f
3 changed files with 79 additions and 22 deletions


@@ -636,8 +636,12 @@ def get_optimized_runner(
)
except Exception as e:
logger.warning(
"CUDA graph capture failed for %s, falling back to standard ONNX runner: %s",
"CUDA graph capture failed for model_type=%s path=%s "
"device_id=%s providers=%s; falling back to standard ONNX runner: %s",
model_type,
model_path,
cuda_graph_options.get("device_id"),
providers,
e,
)
@@ -651,10 +655,9 @@ def get_optimized_runner(
options.pop(0)
if providers and providers[0] == "CUDAExecutionProvider":
options[0] = {
**options[0],
"gpu_mem_limit": compute_cuda_mem_limit(model_path, cuda_graph=False),
}
gpu_mem_limit = compute_cuda_mem_limit(model_path, cuda_graph=False)
if gpu_mem_limit is not None:
options[0] = {**options[0], "gpu_mem_limit": gpu_mem_limit}
return ONNXModelRunner(
ort.InferenceSession(


@@ -125,11 +125,12 @@ class TestComputeCudaMemLimit(unittest.TestCase):
self.assertLessEqual(limit, int(total_vram * 0.80))
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
def test_fallback_on_cuda_unavailable(self, _mock_cdll):
def test_returns_none_when_cuda_unavailable(self, _mock_cdll):
# See compute_cuda_mem_limit docstring for the tradeoff: returning a
# hardcoded fallback was wrong for low-VRAM devices (Jetson Nano, K620).
from frigate.util.model import compute_cuda_mem_limit
limit = compute_cuda_mem_limit("/fake/model.onnx")
self.assertEqual(limit, 4 * 1024**3)
self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx"))
@patch("frigate.util.model.ctypes.CDLL")
@patch("os.path.getsize", return_value=50 * 1024 * 1024)
@@ -148,17 +149,17 @@ class TestComputeCudaMemLimit(unittest.TestCase):
@patch("frigate.util.model.ctypes.CDLL")
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
def test_fallback_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll):
def test_returns_none_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll):
# Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0,
# producing gpu_mem_limit=0 and immediate session OOM.
# producing gpu_mem_limit=0 and immediate session OOM. We now return
# None so the caller omits gpu_mem_limit and ORT manages the arena.
from frigate.util.model import compute_cuda_mem_limit
mock_lib = MagicMock()
mock_cdll.return_value = mock_lib
mock_lib.cudaMemGetInfo.return_value = 2 # cudaErrorMemoryAllocation
limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
self.assertEqual(limit, 4 * 1024**3)
self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False))
@patch("frigate.util.model.ctypes.CDLL")
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
@@ -341,6 +342,44 @@ class TestOrtLeakFixRegression(unittest.TestCase):
)
class TestRunnerOmitsGpuMemLimitOnCudaQueryFailure(unittest.TestCase):
"""When compute_cuda_mem_limit returns None, get_optimized_runner must NOT
inject gpu_mem_limit at all, leaving ORT's grow-as-needed default in place."""
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
@patch(
"frigate.detectors.detection_runners.get_ort_providers",
return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
)
@patch(
"frigate.detectors.detection_runners.is_rknn_compatible",
return_value=False,
)
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
def test_no_gpu_mem_limit_key_when_cuda_query_fails(
self, _gs, _cdll, _rknn, _gp, mock_session
):
from frigate.detectors.detection_runners import get_optimized_runner
from frigate.embeddings.types import EnrichmentModelTypeEnum
mock_session.return_value.get_inputs.return_value = []
mock_session.return_value.get_outputs.return_value = []
get_optimized_runner(
"/fake/jina.onnx",
device="GPU",
model_type=EnrichmentModelTypeEnum.jina_v2.value,
)
provider_opts = mock_session.call_args.kwargs["provider_options"]
self.assertNotIn(
"gpu_mem_limit",
provider_opts[0],
"Must omit (not set to 0, not set to a guess) when query fails",
)
class TestCudaGraphFallbackLogsException(unittest.TestCase):
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
@patch(


@@ -284,18 +284,33 @@ def post_process_yolox(
### ONNX Utilities
def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:
def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int | None:
"""Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena.
For CudaGraphRunner (YOLO detection) do NOT call this CUDA graph capture
For CudaGraphRunner (YOLO detection) do NOT call this - CUDA graph capture
requires all intermediate tensors to be live simultaneously, so peak GPU memory
is 15-20× the model file size and cannot be safely capped. This function is
is 15-20x the model file size and cannot be safely capped. This function is
intended for embedding ONNXModelRunner sessions only.
Returns a limit derived from:
- Floor: model file size × peak_multiplier (≥ 2 GB)
- Ceiling: min(80% of total VRAM, 90% of currently free VRAM)
Falls back to 4 GB if the CUDA runtime query fails.
- min(model file size x peak_multiplier, 80% of total VRAM, 90% of free VRAM)
Returns None if the CUDA runtime query fails. The caller MUST then omit
gpu_mem_limit from provider_options so ORT falls back to its own default
(grow-as-needed up to device capacity).
Tradeoff: a hardcoded fallback (e.g. 4 GB) was previously returned here,
but that number is wrong for both ends of the spectrum:
- On Jetson Nano (4 GB shared), Quadro K620 (2 GB), GT 1030 (2 GB), and
any container where /dev/nvidia* passthrough is broken, asking for 4 GB
causes ORT session init to fail with cudaErrorMemoryAllocation.
- On a 24 GB RTX 3090 with 20 GB free, capping at 4 GB needlessly
starves the session and forces extra arena reallocations.
Returning None and letting ORT manage the arena itself is the
least-surprising behavior when we cannot actually measure VRAM. The
leak vectors this PR addresses (mem_pattern, mallopt) are independent
of the BFC arena cap, so dropping the cap on the failure path does
not reintroduce the leak.
"""
try:
libcudart = ctypes.CDLL("libcudart.so")
@@ -309,14 +324,14 @@ def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:
total = total_bytes.value
free = free_bytes.value
except Exception as e:
logger.debug("cudaMemGetInfo unavailable (%s); using 4 GB gpu_mem_limit fallback", e)
return 4 * 1024**3
logger.debug("cudaMemGetInfo unavailable (%s); omitting gpu_mem_limit", e)
return None
peak_multiplier = 14 if cuda_graph else 7
floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
desired = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
# Honor free VRAM so co-resident embedding sessions (jina text + vision,
# paddleocr det + rec, arcface) don't OOM each other on shared GPUs.
return min(floor, int(total * 0.80), int(free * 0.90))
return min(desired, int(total * 0.80), int(free * 0.90))
def get_ort_providers(