diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py
index 277b1c542..e397d73fb 100644
--- a/frigate/detectors/detection_runners.py
+++ b/frigate/detectors/detection_runners.py
@@ -625,10 +625,11 @@ def get_optimized_runner(
                 ),
                 cuda_graph_options["device_id"],
             )
-        except Exception:
+        except Exception as e:
             logger.warning(
-                "CUDA graph capture failed for %s, falling back to standard ONNX runner",
+                "CUDA graph capture failed for %s, falling back to standard ONNX runner: %s",
                 model_path,
+                e,
             )

     if (
diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py
index c914b9bdc..e8079f1c6 100644
--- a/frigate/test/test_detection_runners.py
+++ b/frigate/test/test_detection_runners.py
@@ -100,52 +100,82 @@ class TestHasVariableLengthInputs(unittest.TestCase):

 class TestComputeCudaMemLimit(unittest.TestCase):
+    @staticmethod
+    def _fake_mem_get_info(free_value: int, total_value: int):
+        def _impl(free_ptr, total_ptr):
+            free_ptr._obj.value = free_value
+            total_ptr._obj.value = total_value
+            return 0  # cudaSuccess
+
+        return _impl
+
     @patch("frigate.util.model.ctypes.CDLL")
-    @patch("os.path.getsize", return_value=200 * 1024 * 1024)  # 200 MB model
-    def test_respects_ceiling(self, mock_getsize, mock_cdll):
+    @patch("os.path.getsize", return_value=4 * 1024**3)  # 4 GB model: 28 GB floor
+    def test_respects_ceiling(self, _mock_getsize, mock_cdll):
         """gpu_mem_limit must not exceed 80% of total VRAM."""
         from frigate.util.model import compute_cuda_mem_limit

         total_vram = 24 * 1024**3  # 24 GB
         mock_lib = MagicMock()
         mock_cdll.return_value = mock_lib
-
-        def fake_mem_get_info(free_ptr, total_ptr):
-            total_ptr._obj.value = total_vram
-            free_ptr._obj.value = total_vram
-
-        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
+        mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
+            total_vram, total_vram
+        )

         limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
-        self.assertLessEqual(limit, int(total_vram * 0.80))
+        # With a 28 GB floor the 80% ceiling must bind exactly; a small model
+        # leaves the ceiling untested because the floor always wins the min().
+        self.assertEqual(limit, int(total_vram * 0.80))

     @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
     def test_fallback_on_cuda_unavailable(self, _mock_cdll):
         """Falls back to 4 GB when CUDA runtime is not available."""
         from frigate.util.model import compute_cuda_mem_limit

         limit = compute_cuda_mem_limit("/fake/model.onnx")
         self.assertEqual(limit, 4 * 1024**3)

     @patch("frigate.util.model.ctypes.CDLL")
     @patch("os.path.getsize", return_value=50 * 1024 * 1024)  # 50 MB model
-    def test_floor_is_at_least_2gb(self, mock_getsize, mock_cdll):
+    def test_floor_is_at_least_2gb(self, _mock_getsize, mock_cdll):
         """Floor must be at least 2 GB regardless of model size."""
         from frigate.util.model import compute_cuda_mem_limit

         total_vram = 24 * 1024**3
         mock_lib = MagicMock()
         mock_cdll.return_value = mock_lib
-
-        def fake_mem_get_info(free_ptr, total_ptr):
-            total_ptr._obj.value = total_vram
-            free_ptr._obj.value = total_vram
-
-        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
+        mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
+            total_vram, total_vram
+        )

         limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
         self.assertGreaterEqual(limit, 2 * 1024**3)

+    @patch("frigate.util.model.ctypes.CDLL")
+    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
+    def test_fallback_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll):
+        # Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0,
+        # producing gpu_mem_limit=0 and immediate session OOM.
+        from frigate.util.model import compute_cuda_mem_limit
+
+        mock_lib = MagicMock()
+        mock_cdll.return_value = mock_lib
+        mock_lib.cudaMemGetInfo.return_value = 2  # cudaErrorMemoryAllocation
+
+        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
+        self.assertEqual(limit, 4 * 1024**3)
+
+    @patch("frigate.util.model.ctypes.CDLL")
+    @patch("os.path.getsize", return_value=500 * 1024 * 1024)  # 3.5 GB floor
+    def test_capped_by_free_vram_when_constrained(self, _mock_getsize, mock_cdll):
+        # Bug #2: with 3 GB free of 24 GB total, the limit must honor the
+        # 90%-of-free cap (2.7 GB), not just the 80%-of-total ceiling;
+        # co-resident embedding sessions would OOM otherwise. The 500 MB
+        # model yields a 3.5 GB floor, so the free-VRAM cap must bind (a
+        # smaller model would pass even without the fix).
+        from frigate.util.model import compute_cuda_mem_limit
+
+        mock_lib = MagicMock()
+        mock_cdll.return_value = mock_lib
+        mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
+            3 * 1024**3, 24 * 1024**3
+        )
+
+        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
+        self.assertEqual(limit, int(3 * 1024**3 * 0.90))
+

 class TestOrtLeakFixRegression(unittest.TestCase):
     """Regression guards for the embeddings_manager ORT memory leak fix.

@@ -311,5 +341,43 @@ class TestOrtLeakFixRegression(unittest.TestCase):
         )


+class TestCudaGraphFallbackLogsException(unittest.TestCase):
+    @patch("frigate.detectors.detection_runners.ort.InferenceSession")
+    @patch(
+        "frigate.detectors.detection_runners.get_ort_providers",
+        return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
+    )
+    @patch(
+        "frigate.detectors.detection_runners.is_rknn_compatible",
+        return_value=False,
+    )
+    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
+    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
+    def test_fallback_warning_includes_exception_text(
+        self, _mock_getsize, _mock_cdll, _mock_rknn, _mock_providers, mock_session
+    ):
+        # Concern #1: the bare `except Exception:` swallowed the underlying
+        # ORT error (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.),
+        # turning a debuggable failure into an opaque "fell back to ONNX runner".
+        from frigate.detectors.detection_runners import get_optimized_runner
+        from frigate.detectors.detector_config import ModelTypeEnum
+
+        # First session construction (graph capture) fails; the retry succeeds.
+        mock_session.side_effect = [
+            RuntimeError("cudaErrorStreamCaptureUnsupported"),
+            MagicMock(get_inputs=lambda: [], get_outputs=lambda: []),
+        ]
+
+        with self.assertLogs(
+            "frigate.detectors.detection_runners", level="WARNING"
+        ) as captured:
+            get_optimized_runner(
+                "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value
+            )
+
+        joined = "\n".join(captured.output)
+        self.assertIn("CUDA graph capture failed", joined)
+        self.assertIn("cudaErrorStreamCaptureUnsupported", joined)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/frigate/util/model.py b/frigate/util/model.py
index d0b8721cc..9867115a3 100644
--- a/frigate/util/model.py
+++ b/frigate/util/model.py
@@ -294,23 +294,29 @@ def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:

     Returns a limit derived from:
     - Floor: model file size × peak_multiplier (≥ 2 GB)
-    - Ceiling: 80% of total GPU VRAM
+    - Ceiling: min(80% of total VRAM, 90% of currently free VRAM)

     Falls back to 4 GB if the CUDA runtime query fails.
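+
+    Worked example (illustrative numbers): a 1 GB model with cuda_graph=False
+    gives floor = max(1 GB × 7, 2 GB) = 7 GB; on a 24 GB card with 3 GB free,
+    the result is min(7 GB, 0.80 × 24 GB, 0.90 × 3 GB) = 2.7 GB.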
""" try: libcudart = ctypes.CDLL("libcudart.so") free_bytes = ctypes.c_size_t() total_bytes = ctypes.c_size_t() - libcudart.cudaMemGetInfo(ctypes.byref(free_bytes), ctypes.byref(total_bytes)) + rc = libcudart.cudaMemGetInfo( + ctypes.byref(free_bytes), ctypes.byref(total_bytes) + ) + if rc != 0 or total_bytes.value == 0: + raise RuntimeError(f"cudaMemGetInfo rc={rc} total={total_bytes.value}") total = total_bytes.value - except Exception: - logger.debug("cudaMemGetInfo unavailable; using 4 GB gpu_mem_limit fallback") + free = free_bytes.value + except Exception as e: + logger.debug("cudaMemGetInfo unavailable (%s); using 4 GB gpu_mem_limit fallback", e) return 4 * 1024**3 peak_multiplier = 14 if cuda_graph else 7 floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3) - ceiling = int(total * 0.80) - return min(floor, ceiling) + # Honor free VRAM so co-resident embedding sessions (jina text + vision, + # paddleocr det + rec, arcface) don't OOM each other on shared GPUs. + return min(floor, int(total * 0.80), int(free * 0.90)) def get_ort_providers(