Add specific note about inability to use CUDA graphs for some models

This commit is contained in:
Nicolas Mowen 2025-09-13 19:59:39 -06:00
parent 834056a5c7
commit f49f693817
3 changed files with 54 additions and 47 deletions

View File

@ -6,10 +6,12 @@ from typing import Any
import onnxruntime as ort import onnxruntime as ort
from frigate.detectors.plugins.openvino import OpenVINOModelRunner from frigate.detectors.plugins.openvino import OpenVINOModelRunner
from frigate.detectors.plugins.onnx import CudaGraphRunner
from frigate.embeddings.onnx.runner import RKNNModelRunner from frigate.embeddings.onnx.runner import RKNNModelRunner
from frigate.util.model import get_ort_providers from frigate.util.model import get_ort_providers
from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible
class BaseModelRunner(ABC): class BaseModelRunner(ABC):
"""Abstract base class for model runners.""" """Abstract base class for model runners."""
@ -36,25 +38,8 @@ class BaseModelRunner(ABC):
class ONNXModelRunner(BaseModelRunner): class ONNXModelRunner(BaseModelRunner):
"""Run ONNX models using ONNX Runtime.""" """Run ONNX models using ONNX Runtime."""
def __init__(self, model_path: str, device: str, requires_fp16: bool = False): def __init__(self, ort: ort.InferenceSession):
super().__init__(model_path, device) self.ort = ort
self.requires_fp16 = requires_fp16
self.ort: ort.InferenceSession = None
self._load_model()
def _load_model(self):
"""Load the ONNX model."""
providers, options = get_ort_providers(
self.device == "CPU",
self.device,
self.requires_fp16,
)
self.ort = ort.InferenceSession(
self.model_path,
providers=providers,
provider_options=options,
)
def get_input_names(self) -> list[str]: def get_input_names(self) -> list[str]:
return [input.name for input in self.ort.get_inputs()] return [input.name for input in self.ort.get_inputs()]
@ -66,24 +51,32 @@ class ONNXModelRunner(BaseModelRunner):
def run(self, input: dict[str, Any]) -> Any | None: def run(self, input: dict[str, Any]) -> Any | None:
return self.ort.run(None, input) return self.ort.run(None, input)
def get_optimized_runner(model_path: str, device: str, **kwargs) -> BaseModelRunner: def get_optimized_runner(model_path: str, device: str, **kwargs) -> BaseModelRunner:
"""Get an optimized runner for the hardware.""" """Get an optimized runner for the hardware."""
if device == "CPU": if device == "CPU":
return ONNXModelRunner(model_path, device, **kwargs) return ONNXModelRunner(model_path, device, **kwargs)
if is_rknn_compatible(model_path): if is_rknn_compatible(model_path):
rknn_path = auto_convert_model(model_path) rknn_path = auto_convert_model(model_path)
if rknn_path: if rknn_path:
return RKNNModelRunner(rknn_path, device) return RKNNModelRunner(rknn_path, device)
providers, options = get_ort_providers( providers, options = get_ort_providers(device == "CPU", device, **kwargs)
device == "CPU",
device,
**kwargs
)
if "OpenVINOExecutionProvider" in providers: if "OpenVINOExecutionProvider" in providers:
return OpenVINOModelRunner(model_path, device, **kwargs) return OpenVINOModelRunner(model_path, device, **kwargs)
return ONNXModelRunner(model_path, device, **kwargs) ort = ort.InferenceSession(
model_path,
providers=providers,
provider_options=options,
)
cuda_idx = providers.index("CUDAExecutionProvider")
if cuda_idx == 0:
return CudaGraphRunner(ort, options[cuda_idx].get("device_id", 0))
return ONNXModelRunner(model_path, device, **kwargs)

View File

@ -5,6 +5,7 @@ import onnxruntime as ort
from pydantic import Field from pydantic import Field
from typing_extensions import Literal from typing_extensions import Literal
from frigate.detectors.base_runner import BaseModelRunner
from frigate.detectors.detection_api import DetectionApi from frigate.detectors.detection_api import DetectionApi
from frigate.detectors.detector_config import ( from frigate.detectors.detector_config import (
BaseDetectorConfig, BaseDetectorConfig,
@ -23,10 +24,13 @@ logger = logging.getLogger(__name__)
DETECTOR_KEY = "onnx" DETECTOR_KEY = "onnx"
class CudaGraphRunner: class CudaGraphRunner(BaseModelRunner):
"""Encapsulates CUDA Graph capture and replay using ONNX Runtime IOBinding. """Encapsulates CUDA Graph capture and replay using ONNX Runtime IOBinding.
This runner assumes a single tensor input and binds all model outputs. This runner assumes a single tensor input and binds all model outputs.
NOTE: CUDA Graphs limit supported model operations, so they are not usable
for more complex models like CLIP or PaddleOCR.
""" """
def __init__(self, session: ort.InferenceSession, cuda_device_id: int): def __init__(self, session: ort.InferenceSession, cuda_device_id: int):
@ -39,6 +43,14 @@ class CudaGraphRunner:
self._input_ortvalue: ort.OrtValue | None = None self._input_ortvalue: ort.OrtValue | None = None
self._output_ortvalues: ort.OrtValue | None = None self._output_ortvalues: ort.OrtValue | None = None
def get_input_names(self) -> list[str]:
"""Get input names for the model."""
return [input.name for input in self._session.get_inputs()]
def get_input_width(self) -> int:
"""Get the input width of the model."""
return self._session.get_inputs()[0].shape[3]
def run(self, input_name: str, tensor_input: np.ndarray): def run(self, input_name: str, tensor_input: np.ndarray):
tensor_input = np.ascontiguousarray(tensor_input) tensor_input = np.ascontiguousarray(tensor_input)
@ -114,7 +126,6 @@ class ONNXDetector(DetectionApi):
try: try:
if "CUDAExecutionProvider" in providers: if "CUDAExecutionProvider" in providers:
cuda_idx = providers.index("CUDAExecutionProvider")
self._cuda_device_id = options[cuda_idx].get("device_id", 0) self._cuda_device_id = options[cuda_idx].get("device_id", 0)
if options[cuda_idx].get("enable_cuda_graph"): if options[cuda_idx].get("enable_cuda_graph"):

View File

@ -41,18 +41,20 @@ class OpenVINOModelRunner:
def __init__(self, model_path: str, device: str, **kwargs): def __init__(self, model_path: str, device: str, **kwargs):
self.model_path = model_path self.model_path = model_path
self.device = device self.device = device
if not os.path.isfile(model_path): if not os.path.isfile(model_path):
raise FileNotFoundError(f"OpenVINO model file {model_path} not found.") raise FileNotFoundError(f"OpenVINO model file {model_path} not found.")
self.ov_core = ov.Core() self.ov_core = ov.Core()
# Apply performance optimization # Apply performance optimization
self.ov_core.set_property(device, {"PERF_COUNT": "NO"}) self.ov_core.set_property(device, {"PERF_COUNT": "NO"})
# Compile model # Compile model
self.compiled_model = self.ov_core.compile_model(model=model_path, device_name=device) self.compiled_model = self.ov_core.compile_model(
model=model_path, device_name=device
)
# Create reusable inference request # Create reusable inference request
self.infer_request = self.compiled_model.create_infer_request() self.infer_request = self.compiled_model.create_infer_request()
input_shape = self.compiled_model.inputs[0].get_shape() input_shape = self.compiled_model.inputs[0].get_shape()
@ -70,24 +72,24 @@ class OpenVINOModelRunner:
def run(self, input_data: np.ndarray) -> list[np.ndarray]: def run(self, input_data: np.ndarray) -> list[np.ndarray]:
"""Run inference with the model. """Run inference with the model.
Args: Args:
input_data: Input tensor data input_data: Input tensor data
Returns: Returns:
List of output tensors List of output tensors
""" """
# Copy input data to pre-allocated tensor # Copy input data to pre-allocated tensor
np.copyto(self.input_tensor.data, input_data) np.copyto(self.input_tensor.data, input_data)
# Run inference # Run inference
self.infer_request.infer(self.input_tensor) self.infer_request.infer(self.input_tensor)
# Get all output tensors # Get all output tensors
outputs = [] outputs = []
for i in range(len(self.compiled_model.outputs)): for i in range(len(self.compiled_model.outputs)):
outputs.append(self.infer_request.get_output_tensor(i).data) outputs.append(self.infer_request.get_output_tensor(i).data)
return outputs return outputs
@ -110,16 +112,15 @@ class OvDetector(DetectionApi):
self.w = detector_config.model.width self.w = detector_config.model.width
self.runner = OpenVINOModelRunner( self.runner = OpenVINOModelRunner(
model_path=detector_config.model.path, model_path=detector_config.model.path, device=detector_config.device
device=detector_config.device
) )
# For dfine models, also pre-allocate target sizes tensor # For dfine models, also pre-allocate target sizes tensor
if self.ov_model_type == ModelTypeEnum.dfine: if self.ov_model_type == ModelTypeEnum.dfine:
self.target_sizes_tensor = ov.Tensor( self.target_sizes_tensor = ov.Tensor(
np.array([[self.h, self.w]], dtype=np.int64) np.array([[self.h, self.w]], dtype=np.int64)
) )
self.model_invalid = False self.model_invalid = False
if self.ov_model_type not in self.supported_models: if self.ov_model_type not in self.supported_models:
@ -173,7 +174,9 @@ class OvDetector(DetectionApi):
self.output_indexes = 0 self.output_indexes = 0
while True: while True:
try: try:
tensor_shape = self.runner.compiled_model.output(self.output_indexes).shape tensor_shape = self.runner.compiled_model.output(
self.output_indexes
).shape
logger.info( logger.info(
f"Model Output-{self.output_indexes} Shape: {tensor_shape}" f"Model Output-{self.output_indexes} Shape: {tensor_shape}"
) )
@ -205,12 +208,12 @@ class OvDetector(DetectionApi):
# Use named inputs for dfine models # Use named inputs for dfine models
inputs = { inputs = {
"images": tensor_input, "images": tensor_input,
"orig_target_sizes": np.array([[self.h, self.w]], dtype=np.int64) "orig_target_sizes": np.array([[self.h, self.w]], dtype=np.int64),
} }
outputs = self.runner.run_with_named_inputs(inputs) outputs = self.runner.run_with_named_inputs(inputs)
tensor_output = ( tensor_output = (
outputs["output0"], outputs["output0"],
outputs["output1"], outputs["output1"],
outputs["output2"], outputs["output2"],
) )
return post_process_dfine(tensor_output, self.w, self.h) return post_process_dfine(tensor_output, self.w, self.h)