fix: handle CUDA query failure and free-VRAM contention in gpu_mem_limit

Targeted follow-ups to the embeddings_manager ORT leak fix that affect all
GPU-resident embedding models (Jina text+vision, PaddleOCR det+rec, ArcFace,
YOLOv9 license plate). Detection-side YOLO runners are unaffected by the
memory-limit changes, since CudaGraphRunner does not call compute_cuda_mem_limit.

- compute_cuda_mem_limit now checks the cudaMemGetInfo return code instead
  of trusting that a non-throwing call populated the buffers. Previously a
  non-zero rc left both pointers at 0, producing gpu_mem_limit=0 and
  immediate session OOM rather than the documented 4 GB fallback.
- The limit also factors in currently-free VRAM (free * 0.9), not just
  total. On a shared GPU where co-resident embedding sessions have already
  consumed most of the device, capping at 80% of total still over-allocates
  (see the sketch after this list).
- The CUDA graph fallback path now logs the underlying exception text so
  failures (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.)
  stop being swallowed by the bare except.
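
For reference, here is a minimal standalone sketch of the resulting limit
computation (it mirrors the frigate/util/model.py hunk below; sketch_mem_limit
and GiB are illustrative names, not part of the change), with a worked example
for a memory-constrained GPU:

    # Sketch of the fixed gpu_mem_limit arithmetic; not a drop-in for
    # compute_cuda_mem_limit, just the policy it now implements.
    GiB = 1024**3

    def sketch_mem_limit(
        model_bytes: int, free: int, total: int, cuda_graph: bool = False
    ) -> int:
        peak_multiplier = 14 if cuda_graph else 7
        floor = max(model_bytes * peak_multiplier, 2 * GiB)
        # New behavior: the ceiling honors free VRAM, not just total VRAM.
        return min(floor, int(total * 0.80), int(free * 0.90))

    # 200 MB model on a 24 GB card with only 3 GB free:
    #   floor    = max(200 MB * 7, 2 GiB)           = 2 GiB
    #   ceilings = 19.2 GiB (total), 2.7 GiB (free) -> limit = 2 GiB
    assert sketch_mem_limit(200 * 1024**2, 3 * GiB, 24 * GiB) == 2 * GiB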

Tests cover all three regression paths, and existing tests were updated so
that the mocked cudaMemGetInfo returns cudaSuccess explicitly.
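
For readers of the test diff below: the shared _fake_mem_get_info helper works
by reaching into the objects produced by ctypes.byref(), which expose the
wrapped instance as ._obj (a CPython implementation detail, not documented
ctypes API). A self-contained sketch of the pattern, with illustrative names:

    import ctypes
    from unittest.mock import MagicMock

    def fake_mem_get_info(free_ptr, total_ptr):
        # byref() results expose the underlying c_size_t as ._obj (CPython detail)
        free_ptr._obj.value = 3 * 1024**3
        total_ptr._obj.value = 24 * 1024**3
        return 0  # cudaSuccess

    lib = MagicMock()
    lib.cudaMemGetInfo.side_effect = fake_mem_get_info

    free, total = ctypes.c_size_t(), ctypes.c_size_t()
    rc = lib.cudaMemGetInfo(ctypes.byref(free), ctypes.byref(total))
    assert rc == 0 and free.value == 3 * 1024**3 and total.value == 24 * 1024**3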

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
felalex 2026-05-02 23:44:18 -07:00
parent 71060805f0
commit 1717f21f69
3 changed files with 103 additions and 28 deletions

frigate/detectors/detection_runners.py

@@ -625,10 +625,11 @@ def get_optimized_runner(
                 ),
                 cuda_graph_options["device_id"],
             )
-        except Exception:
+        except Exception as e:
             logger.warning(
-                "CUDA graph capture failed for %s, falling back to standard ONNX runner",
+                "CUDA graph capture failed for %s, falling back to standard ONNX runner: %s",
                 model_path,
+                e,
             )
     if (


@@ -100,52 +100,82 @@ class TestHasVariableLengthInputs(unittest.TestCase):
 class TestComputeCudaMemLimit(unittest.TestCase):
+    @staticmethod
+    def _fake_mem_get_info(free_value: int, total_value: int):
+        def _impl(free_ptr, total_ptr):
+            free_ptr._obj.value = free_value
+            total_ptr._obj.value = total_value
+            return 0  # cudaSuccess
+
+        return _impl
+
     @patch("frigate.util.model.ctypes.CDLL")
-    @patch("os.path.getsize", return_value=200 * 1024 * 1024)  # 200 MB model
-    def test_respects_ceiling(self, mock_getsize, mock_cdll):
-        """gpu_mem_limit must not exceed 80% of total VRAM."""
+    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
+    def test_respects_ceiling(self, _mock_getsize, mock_cdll):
         from frigate.util.model import compute_cuda_mem_limit
 
-        total_vram = 24 * 1024**3  # 24 GB
+        total_vram = 24 * 1024**3
         mock_lib = MagicMock()
         mock_cdll.return_value = mock_lib
 
-        def fake_mem_get_info(free_ptr, total_ptr):
-            total_ptr._obj.value = total_vram
-            free_ptr._obj.value = total_vram
-
-        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
+        mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
+            total_vram, total_vram
+        )
 
         limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
         self.assertLessEqual(limit, int(total_vram * 0.80))
 
     @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
     def test_fallback_on_cuda_unavailable(self, _mock_cdll):
         """Falls back to 4 GB when CUDA runtime is not available."""
         from frigate.util.model import compute_cuda_mem_limit
 
         limit = compute_cuda_mem_limit("/fake/model.onnx")
         self.assertEqual(limit, 4 * 1024**3)
 
     @patch("frigate.util.model.ctypes.CDLL")
-    @patch("os.path.getsize", return_value=50 * 1024 * 1024)  # 50 MB model
-    def test_floor_is_at_least_2gb(self, mock_getsize, mock_cdll):
-        """Floor must be at least 2 GB regardless of model size."""
+    @patch("os.path.getsize", return_value=50 * 1024 * 1024)
+    def test_floor_is_at_least_2gb(self, _mock_getsize, mock_cdll):
         from frigate.util.model import compute_cuda_mem_limit
 
         total_vram = 24 * 1024**3
         mock_lib = MagicMock()
         mock_cdll.return_value = mock_lib
 
-        def fake_mem_get_info(free_ptr, total_ptr):
-            total_ptr._obj.value = total_vram
-            free_ptr._obj.value = total_vram
-
-        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
+        mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
+            total_vram, total_vram
+        )
 
         limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
         self.assertGreaterEqual(limit, 2 * 1024**3)
 
+    @patch("frigate.util.model.ctypes.CDLL")
+    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
+    def test_fallback_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll):
+        # Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0,
+        # producing gpu_mem_limit=0 and immediate session OOM.
+        from frigate.util.model import compute_cuda_mem_limit
+
+        mock_lib = MagicMock()
+        mock_cdll.return_value = mock_lib
+        mock_lib.cudaMemGetInfo.return_value = 2  # cudaErrorMemoryAllocation
+
+        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
+        self.assertEqual(limit, 4 * 1024**3)
+
+    @patch("frigate.util.model.ctypes.CDLL")
+    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
+    def test_capped_by_free_vram_when_constrained(self, _mock_getsize, mock_cdll):
+        # Bug #2: with 3 GB free of 24 GB, the limit must respect free * 0.9,
+        # not 80% of total; co-resident embedding sessions would OOM otherwise.
+        from frigate.util.model import compute_cuda_mem_limit
+
+        mock_lib = MagicMock()
+        mock_cdll.return_value = mock_lib
+        mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
+            3 * 1024**3, 24 * 1024**3
+        )
+
+        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
+        self.assertLessEqual(limit, int(3 * 1024**3 * 0.90))
+
 
 class TestOrtLeakFixRegression(unittest.TestCase):
     """Regression guards for the embeddings_manager ORT memory leak fix.
@@ -311,5 +341,43 @@ class TestOrtLeakFixRegression(unittest.TestCase):
         )
 
 
+class TestCudaGraphFallbackLogsException(unittest.TestCase):
+    @patch("frigate.detectors.detection_runners.ort.InferenceSession")
+    @patch(
+        "frigate.detectors.detection_runners.get_ort_providers",
+        return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
+    )
+    @patch(
+        "frigate.detectors.detection_runners.is_rknn_compatible",
+        return_value=False,
+    )
+    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
+    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
+    def test_fallback_warning_includes_exception_text(
+        self, _gs, _cdll, _rknn, _gp, mock_session
+    ):
+        # Concern #1: the bare `except Exception:` swallowed the underlying
+        # ORT error (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.),
+        # turning a debuggable failure into an opaque "fell back to ONNX runner".
+        from frigate.detectors.detection_runners import get_optimized_runner
+        from frigate.detectors.detector_config import ModelTypeEnum
+
+        mock_session.side_effect = [
+            RuntimeError("cudaErrorStreamCaptureUnsupported"),
+            MagicMock(get_inputs=lambda: [], get_outputs=lambda: []),
+        ]
+
+        with self.assertLogs(
+            "frigate.detectors.detection_runners", level="WARNING"
+        ) as captured:
+            get_optimized_runner(
+                "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value
+            )
+
+        joined = "\n".join(captured.output)
+        self.assertIn("CUDA graph capture failed", joined)
+        self.assertIn("cudaErrorStreamCaptureUnsupported", joined)
+
+
 if __name__ == "__main__":
     unittest.main()

frigate/util/model.py

@@ -294,23 +294,29 @@ def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:
     Returns a limit derived from:
     - Floor: model file size × peak_multiplier (≥ 2 GB)
-    - Ceiling: 80% of total GPU VRAM
+    - Ceiling: min(80% of total VRAM, 90% of currently free VRAM)
 
     Falls back to 4 GB if the CUDA runtime query fails.
     """
     try:
         libcudart = ctypes.CDLL("libcudart.so")
         free_bytes = ctypes.c_size_t()
         total_bytes = ctypes.c_size_t()
-        libcudart.cudaMemGetInfo(ctypes.byref(free_bytes), ctypes.byref(total_bytes))
+        rc = libcudart.cudaMemGetInfo(
+            ctypes.byref(free_bytes), ctypes.byref(total_bytes)
+        )
+        if rc != 0 or total_bytes.value == 0:
+            raise RuntimeError(f"cudaMemGetInfo rc={rc} total={total_bytes.value}")
         total = total_bytes.value
-    except Exception:
-        logger.debug("cudaMemGetInfo unavailable; using 4 GB gpu_mem_limit fallback")
+        free = free_bytes.value
+    except Exception as e:
+        logger.debug("cudaMemGetInfo unavailable (%s); using 4 GB gpu_mem_limit fallback", e)
         return 4 * 1024**3
 
     peak_multiplier = 14 if cuda_graph else 7
     floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
-    ceiling = int(total * 0.80)
-    return min(floor, ceiling)
+    # Honor free VRAM so co-resident embedding sessions (jina text + vision,
+    # paddleocr det + rec, arcface) don't OOM each other on shared GPUs.
+    return min(floor, int(total * 0.80), int(free * 0.90))
 
 
 def get_ort_providers(