mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-05-09 06:55:28 +03:00
fix: omit gpu_mem_limit on CUDA query failure instead of guessing 4 GB
When cudaMemGetInfo cannot be called or returns an error, compute_cuda_mem_limit now returns None and the caller skips injecting gpu_mem_limit, leaving ORT to manage its own arena (grow-as-needed up to device capacity).

Tradeoff documented in the docstring:
- The old behavior returned a hardcoded 4 GB. That was wrong for low-VRAM devices (Jetson Nano 4 GB shared, Quadro K620 2 GB, GT 1030 2 GB) and for broken /dev/nvidia* container passthroughs, where requesting 4 GB caused cudaErrorMemoryAllocation at session init. It was also wrong for big GPUs (24 GB RTX 3090 with 20 GB free), where it needlessly starved the session.
- The leak vectors fixed elsewhere in this PR (mem_pattern + mallopt) are independent of the BFC arena cap, so dropping the cap on the failure path does not reintroduce the leak.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
6a16fa667b
commit
351bef936f
@ -636,8 +636,12 @@ def get_optimized_runner(
|
|||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"CUDA graph capture failed for %s, falling back to standard ONNX runner: %s",
|
"CUDA graph capture failed for model_type=%s path=%s "
|
||||||
|
"device_id=%s providers=%s; falling back to standard ONNX runner: %s",
|
||||||
|
model_type,
|
||||||
model_path,
|
model_path,
|
||||||
|
cuda_graph_options.get("device_id"),
|
||||||
|
providers,
|
||||||
e,
|
e,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -651,10 +655,9 @@ def get_optimized_runner(
|
|||||||
options.pop(0)
|
options.pop(0)
|
||||||
|
|
||||||
if providers and providers[0] == "CUDAExecutionProvider":
|
if providers and providers[0] == "CUDAExecutionProvider":
|
||||||
options[0] = {
|
gpu_mem_limit = compute_cuda_mem_limit(model_path, cuda_graph=False)
|
||||||
**options[0],
|
if gpu_mem_limit is not None:
|
||||||
"gpu_mem_limit": compute_cuda_mem_limit(model_path, cuda_graph=False),
|
options[0] = {**options[0], "gpu_mem_limit": gpu_mem_limit}
|
||||||
}
|
|
||||||
|
|
||||||
return ONNXModelRunner(
|
return ONNXModelRunner(
|
||||||
ort.InferenceSession(
|
ort.InferenceSession(
|
||||||
|
|||||||
@ -125,11 +125,12 @@ class TestComputeCudaMemLimit(unittest.TestCase):
|
|||||||
self.assertLessEqual(limit, int(total_vram * 0.80))
|
self.assertLessEqual(limit, int(total_vram * 0.80))
|
||||||
|
|
||||||
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
|
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
|
||||||
def test_fallback_on_cuda_unavailable(self, _mock_cdll):
|
def test_returns_none_when_cuda_unavailable(self, _mock_cdll):
|
||||||
|
# See compute_cuda_mem_limit docstring for the tradeoff: returning a
|
||||||
|
# hardcoded fallback was wrong for low-VRAM devices (Jetson Nano, K620).
|
||||||
from frigate.util.model import compute_cuda_mem_limit
|
from frigate.util.model import compute_cuda_mem_limit
|
||||||
|
|
||||||
limit = compute_cuda_mem_limit("/fake/model.onnx")
|
self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx"))
|
||||||
self.assertEqual(limit, 4 * 1024**3)
|
|
||||||
|
|
||||||
@patch("frigate.util.model.ctypes.CDLL")
|
@patch("frigate.util.model.ctypes.CDLL")
|
||||||
@patch("os.path.getsize", return_value=50 * 1024 * 1024)
|
@patch("os.path.getsize", return_value=50 * 1024 * 1024)
|
||||||
@ -148,17 +149,17 @@ class TestComputeCudaMemLimit(unittest.TestCase):
|
|||||||
|
|
||||||
@patch("frigate.util.model.ctypes.CDLL")
|
@patch("frigate.util.model.ctypes.CDLL")
|
||||||
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||||
def test_fallback_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll):
|
def test_returns_none_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll):
|
||||||
# Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0,
|
# Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0,
|
||||||
# producing gpu_mem_limit=0 and immediate session OOM.
|
# producing gpu_mem_limit=0 and immediate session OOM. We now return
|
||||||
|
# None so the caller omits gpu_mem_limit and ORT manages the arena.
|
||||||
from frigate.util.model import compute_cuda_mem_limit
|
from frigate.util.model import compute_cuda_mem_limit
|
||||||
|
|
||||||
mock_lib = MagicMock()
|
mock_lib = MagicMock()
|
||||||
mock_cdll.return_value = mock_lib
|
mock_cdll.return_value = mock_lib
|
||||||
mock_lib.cudaMemGetInfo.return_value = 2 # cudaErrorMemoryAllocation
|
mock_lib.cudaMemGetInfo.return_value = 2 # cudaErrorMemoryAllocation
|
||||||
|
|
||||||
limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
|
self.assertIsNone(compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False))
|
||||||
self.assertEqual(limit, 4 * 1024**3)
|
|
||||||
|
|
||||||
@patch("frigate.util.model.ctypes.CDLL")
|
@patch("frigate.util.model.ctypes.CDLL")
|
||||||
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||||
@ -341,6 +342,44 @@ class TestOrtLeakFixRegression(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRunnerOmitsGpuMemLimitOnCudaQueryFailure(unittest.TestCase):
|
||||||
|
"""When compute_cuda_mem_limit returns None, get_optimized_runner must NOT
|
||||||
|
inject gpu_mem_limit at all, leaving ORT's grow-as-needed default in place."""
|
||||||
|
|
||||||
|
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
|
||||||
|
@patch(
|
||||||
|
"frigate.detectors.detection_runners.get_ort_providers",
|
||||||
|
return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
|
||||||
|
)
|
||||||
|
@patch(
|
||||||
|
"frigate.detectors.detection_runners.is_rknn_compatible",
|
||||||
|
return_value=False,
|
||||||
|
)
|
||||||
|
@patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
|
||||||
|
@patch("os.path.getsize", return_value=200 * 1024 * 1024)
|
||||||
|
def test_no_gpu_mem_limit_key_when_cuda_query_fails(
|
||||||
|
self, _gs, _cdll, _rknn, _gp, mock_session
|
||||||
|
):
|
||||||
|
from frigate.detectors.detection_runners import get_optimized_runner
|
||||||
|
from frigate.embeddings.types import EnrichmentModelTypeEnum
|
||||||
|
|
||||||
|
mock_session.return_value.get_inputs.return_value = []
|
||||||
|
mock_session.return_value.get_outputs.return_value = []
|
||||||
|
|
||||||
|
get_optimized_runner(
|
||||||
|
"/fake/jina.onnx",
|
||||||
|
device="GPU",
|
||||||
|
model_type=EnrichmentModelTypeEnum.jina_v2.value,
|
||||||
|
)
|
||||||
|
|
||||||
|
provider_opts = mock_session.call_args.kwargs["provider_options"]
|
||||||
|
self.assertNotIn(
|
||||||
|
"gpu_mem_limit",
|
||||||
|
provider_opts[0],
|
||||||
|
"Must omit (not set to 0, not set to a guess) when query fails",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TestCudaGraphFallbackLogsException(unittest.TestCase):
|
class TestCudaGraphFallbackLogsException(unittest.TestCase):
|
||||||
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
|
@patch("frigate.detectors.detection_runners.ort.InferenceSession")
|
||||||
@patch(
|
@patch(
|
||||||
|
|||||||
@ -284,18 +284,33 @@ def post_process_yolox(
|
|||||||
### ONNX Utilities
|
### ONNX Utilities
|
||||||
|
|
||||||
|
|
||||||
def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:
|
def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int | None:
|
||||||
"""Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena.
|
"""Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena.
|
||||||
|
|
||||||
For CudaGraphRunner (YOLO detection) do NOT call this — CUDA graph capture
|
For CudaGraphRunner (YOLO detection) do NOT call this - CUDA graph capture
|
||||||
requires all intermediate tensors to be live simultaneously, so peak GPU memory
|
requires all intermediate tensors to be live simultaneously, so peak GPU memory
|
||||||
is 15-20× the model file size and cannot be safely capped. This function is
|
is 15-20x the model file size and cannot be safely capped. This function is
|
||||||
intended for embedding ONNXModelRunner sessions only.
|
intended for embedding ONNXModelRunner sessions only.
|
||||||
|
|
||||||
Returns a limit derived from:
|
Returns a limit derived from:
|
||||||
- Floor: model file size × peak_multiplier (≥ 2 GB)
|
- min(max(model file size x peak_multiplier, 2 GB), 80% of total VRAM, 90% of free VRAM)
|
||||||
- Ceiling: min(80% of total VRAM, 90% of currently free VRAM)
|
|
||||||
Falls back to 4 GB if the CUDA runtime query fails.
|
Returns None if the CUDA runtime query fails. The caller MUST then omit
|
||||||
|
gpu_mem_limit from provider_options so ORT falls back to its own default
|
||||||
|
(grow-as-needed up to device capacity).
|
||||||
|
|
||||||
|
Tradeoff: a hardcoded fallback (e.g. 4 GB) was previously returned here,
|
||||||
|
but that number is wrong for both ends of the spectrum:
|
||||||
|
- On Jetson Nano (4 GB shared), Quadro K620 (2 GB), GT 1030 (2 GB), and
|
||||||
|
any container where /dev/nvidia* passthrough is broken, asking for 4 GB
|
||||||
|
causes ORT session init to fail with cudaErrorMemoryAllocation.
|
||||||
|
- On a 24 GB RTX 3090 with 20 GB free, capping at 4 GB needlessly
|
||||||
|
starves the session and forces extra arena reallocations.
|
||||||
|
Returning None and letting ORT manage the arena itself is the
|
||||||
|
least-surprising behavior when we cannot actually measure VRAM. The
|
||||||
|
leak vectors this PR addresses (mem_pattern, mallopt) are independent
|
||||||
|
of the BFC arena cap, so dropping the cap on the failure path does
|
||||||
|
not reintroduce the leak.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
libcudart = ctypes.CDLL("libcudart.so")
|
libcudart = ctypes.CDLL("libcudart.so")
|
||||||
@ -309,14 +324,14 @@ def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:
|
|||||||
total = total_bytes.value
|
total = total_bytes.value
|
||||||
free = free_bytes.value
|
free = free_bytes.value
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug("cudaMemGetInfo unavailable (%s); using 4 GB gpu_mem_limit fallback", e)
|
logger.debug("cudaMemGetInfo unavailable (%s); omitting gpu_mem_limit", e)
|
||||||
return 4 * 1024**3
|
return None
|
||||||
|
|
||||||
peak_multiplier = 14 if cuda_graph else 7
|
peak_multiplier = 14 if cuda_graph else 7
|
||||||
floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
|
desired = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)
|
||||||
# Honor free VRAM so co-resident embedding sessions (jina text + vision,
|
# Honor free VRAM so co-resident embedding sessions (jina text + vision,
|
||||||
# paddleocr det + rec, arcface) don't OOM each other on shared GPUs.
|
# paddleocr det + rec, arcface) don't OOM each other on shared GPUs.
|
||||||
return min(floor, int(total * 0.80), int(free * 0.90))
|
return min(desired, int(total * 0.80), int(free * 0.90))
|
||||||
|
|
||||||
|
|
||||||
def get_ort_providers(
|
def get_ort_providers(
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user