Add specific note about inability to use CUDA graphs for some models

This commit is contained in:
Nicolas Mowen 2025-09-13 19:59:39 -06:00
parent 834056a5c7
commit f49f693817
3 changed files with 54 additions and 47 deletions

View File

@ -6,10 +6,12 @@ from typing import Any
import onnxruntime as ort import onnxruntime as ort
from frigate.detectors.plugins.openvino import OpenVINOModelRunner from frigate.detectors.plugins.openvino import OpenVINOModelRunner
from frigate.detectors.plugins.onnx import CudaGraphRunner
from frigate.embeddings.onnx.runner import RKNNModelRunner from frigate.embeddings.onnx.runner import RKNNModelRunner
from frigate.util.model import get_ort_providers from frigate.util.model import get_ort_providers
from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible
class BaseModelRunner(ABC): class BaseModelRunner(ABC):
"""Abstract base class for model runners.""" """Abstract base class for model runners."""
@ -36,25 +38,8 @@ class BaseModelRunner(ABC):
class ONNXModelRunner(BaseModelRunner): class ONNXModelRunner(BaseModelRunner):
"""Run ONNX models using ONNX Runtime.""" """Run ONNX models using ONNX Runtime."""
def __init__(self, model_path: str, device: str, requires_fp16: bool = False): def __init__(self, ort: ort.InferenceSession):
super().__init__(model_path, device) self.ort = ort
self.requires_fp16 = requires_fp16
self.ort: ort.InferenceSession = None
self._load_model()
def _load_model(self):
"""Load the ONNX model."""
providers, options = get_ort_providers(
self.device == "CPU",
self.device,
self.requires_fp16,
)
self.ort = ort.InferenceSession(
self.model_path,
providers=providers,
provider_options=options,
)
def get_input_names(self) -> list[str]: def get_input_names(self) -> list[str]:
return [input.name for input in self.ort.get_inputs()] return [input.name for input in self.ort.get_inputs()]
@ -66,6 +51,7 @@ class ONNXModelRunner(BaseModelRunner):
def run(self, input: dict[str, Any]) -> Any | None: def run(self, input: dict[str, Any]) -> Any | None:
return self.ort.run(None, input) return self.ort.run(None, input)
def get_optimized_runner(model_path: str, device: str, **kwargs) -> BaseModelRunner: def get_optimized_runner(model_path: str, device: str, **kwargs) -> BaseModelRunner:
"""Get an optimized runner for the hardware.""" """Get an optimized runner for the hardware."""
if device == "CPU": if device == "CPU":
@ -77,13 +63,20 @@ def get_optimized_runner(model_path: str, device: str, **kwargs) -> BaseModelRun
if rknn_path: if rknn_path:
return RKNNModelRunner(rknn_path, device) return RKNNModelRunner(rknn_path, device)
providers, options = get_ort_providers( providers, options = get_ort_providers(device == "CPU", device, **kwargs)
device == "CPU",
device,
**kwargs
)
if "OpenVINOExecutionProvider" in providers: if "OpenVINOExecutionProvider" in providers:
return OpenVINOModelRunner(model_path, device, **kwargs) return OpenVINOModelRunner(model_path, device, **kwargs)
ort = ort.InferenceSession(
model_path,
providers=providers,
provider_options=options,
)
cuda_idx = providers.index("CUDAExecutionProvider")
if cuda_idx == 0:
return CudaGraphRunner(ort, options[cuda_idx].get("device_id", 0))
return ONNXModelRunner(model_path, device, **kwargs) return ONNXModelRunner(model_path, device, **kwargs)

View File

@ -5,6 +5,7 @@ import onnxruntime as ort
from pydantic import Field from pydantic import Field
from typing_extensions import Literal from typing_extensions import Literal
from frigate.detectors.base_runner import BaseModelRunner
from frigate.detectors.detection_api import DetectionApi from frigate.detectors.detection_api import DetectionApi
from frigate.detectors.detector_config import ( from frigate.detectors.detector_config import (
BaseDetectorConfig, BaseDetectorConfig,
@ -23,10 +24,13 @@ logger = logging.getLogger(__name__)
DETECTOR_KEY = "onnx" DETECTOR_KEY = "onnx"
class CudaGraphRunner: class CudaGraphRunner(BaseModelRunner):
"""Encapsulates CUDA Graph capture and replay using ONNX Runtime IOBinding. """Encapsulates CUDA Graph capture and replay using ONNX Runtime IOBinding.
This runner assumes a single tensor input and binds all model outputs. This runner assumes a single tensor input and binds all model outputs.
NOTE: CUDA Graphs support only a restricted set of model operations, so this
runner cannot be used for more complex models like CLIP or PaddleOCR.
""" """
def __init__(self, session: ort.InferenceSession, cuda_device_id: int): def __init__(self, session: ort.InferenceSession, cuda_device_id: int):
@ -39,6 +43,14 @@ class CudaGraphRunner:
self._input_ortvalue: ort.OrtValue | None = None self._input_ortvalue: ort.OrtValue | None = None
self._output_ortvalues: ort.OrtValue | None = None self._output_ortvalues: ort.OrtValue | None = None
def get_input_names(self) -> list[str]:
"""Get input names for the model."""
return [input.name for input in self._session.get_inputs()]
def get_input_width(self) -> int:
"""Get the input width of the model."""
return self._session.get_inputs()[0].shape[3]
def run(self, input_name: str, tensor_input: np.ndarray): def run(self, input_name: str, tensor_input: np.ndarray):
tensor_input = np.ascontiguousarray(tensor_input) tensor_input = np.ascontiguousarray(tensor_input)
@ -114,7 +126,6 @@ class ONNXDetector(DetectionApi):
try: try:
if "CUDAExecutionProvider" in providers: if "CUDAExecutionProvider" in providers:
cuda_idx = providers.index("CUDAExecutionProvider")
self._cuda_device_id = options[cuda_idx].get("device_id", 0) self._cuda_device_id = options[cuda_idx].get("device_id", 0)
if options[cuda_idx].get("enable_cuda_graph"): if options[cuda_idx].get("enable_cuda_graph"):

View File

@ -51,7 +51,9 @@ class OpenVINOModelRunner:
self.ov_core.set_property(device, {"PERF_COUNT": "NO"}) self.ov_core.set_property(device, {"PERF_COUNT": "NO"})
# Compile model # Compile model
self.compiled_model = self.ov_core.compile_model(model=model_path, device_name=device) self.compiled_model = self.ov_core.compile_model(
model=model_path, device_name=device
)
# Create reusable inference request # Create reusable inference request
self.infer_request = self.compiled_model.create_infer_request() self.infer_request = self.compiled_model.create_infer_request()
@ -110,8 +112,7 @@ class OvDetector(DetectionApi):
self.w = detector_config.model.width self.w = detector_config.model.width
self.runner = OpenVINOModelRunner( self.runner = OpenVINOModelRunner(
model_path=detector_config.model.path, model_path=detector_config.model.path, device=detector_config.device
device=detector_config.device
) )
# For dfine models, also pre-allocate target sizes tensor # For dfine models, also pre-allocate target sizes tensor
@ -173,7 +174,9 @@ class OvDetector(DetectionApi):
self.output_indexes = 0 self.output_indexes = 0
while True: while True:
try: try:
tensor_shape = self.runner.compiled_model.output(self.output_indexes).shape tensor_shape = self.runner.compiled_model.output(
self.output_indexes
).shape
logger.info( logger.info(
f"Model Output-{self.output_indexes} Shape: {tensor_shape}" f"Model Output-{self.output_indexes} Shape: {tensor_shape}"
) )
@ -205,7 +208,7 @@ class OvDetector(DetectionApi):
# Use named inputs for dfine models # Use named inputs for dfine models
inputs = { inputs = {
"images": tensor_input, "images": tensor_input,
"orig_target_sizes": np.array([[self.h, self.w]], dtype=np.int64) "orig_target_sizes": np.array([[self.h, self.w]], dtype=np.int64),
} }
outputs = self.runner.run_with_named_inputs(inputs) outputs = self.runner.run_with_named_inputs(inputs)
tensor_output = ( tensor_output = (