From ef59366c2c7973b9177a7411222832ee0c7f20fe Mon Sep 17 00:00:00 2001 From: "Nabheet S. Sandhu" Date: Sun, 7 Jun 2026 21:42:13 -0600 Subject: [PATCH] fix(cuda): fall back gracefully when CUDA graph capture is unsupported When enable_cuda_graph is True and the model contains operations that cannot be fully partitioned to CUDA (e.g. Memcpy nodes from reshape or concat ops), ONNX Runtime fails session creation fatally. This prevents models like YOLOv8 from running on GPU with device: cuda. Wrap the session creation in try/except and fall back to CUDA execution without graph capture. Both paths still use CudaGraphRunner since it works as a regular IOBinding runner even without graph capture. --- frigate/detectors/detection_runners.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py index ee465b3d51..57b00adfcf 100644 --- a/frigate/detectors/detection_runners.py +++ b/frigate/detectors/detection_runners.py @@ -601,16 +601,31 @@ def get_optimized_runner( CudaGraphRunner.is_model_supported(model_type) and providers[0] == "CUDAExecutionProvider" ): - options[0] = { + # Try to enable CUDA graph capture for maximum performance. + # If the model has ops that can't be fully partitioned to CUDA + # (e.g. Memcpy nodes), fall back gracefully without graph capture. + graph_options = { **options[0], "enable_cuda_graph": True, } - return CudaGraphRunner( - ort.InferenceSession( + try: + session = ort.InferenceSession( + model_path, + providers=providers, + provider_options=[graph_options] + options[1:], + ) + except Exception: + logger.warning( + "CUDA graph capture not supported for this model, " + "falling back to CUDA execution without graph capture" + ) + session = ort.InferenceSession( model_path, providers=providers, provider_options=options, - ), + ) + return CudaGraphRunner( + session, options[0]["device_id"], )