diff --git a/frigate/detectors/base_runner.py b/frigate/detectors/base_runner.py index cd3a3486a..7597d690f 100644 --- a/frigate/detectors/base_runner.py +++ b/frigate/detectors/base_runner.py @@ -6,10 +6,12 @@ from typing import Any import onnxruntime as ort from frigate.detectors.plugins.openvino import OpenVINOModelRunner +from frigate.detectors.plugins.onnx import CudaGraphRunner from frigate.embeddings.onnx.runner import RKNNModelRunner from frigate.util.model import get_ort_providers from frigate.util.rknn_converter import auto_convert_model, is_rknn_compatible + class BaseModelRunner(ABC): """Abstract base class for model runners.""" @@ -36,25 +38,8 @@ class BaseModelRunner(ABC): class ONNXModelRunner(BaseModelRunner): """Run ONNX models using ONNX Runtime.""" - def __init__(self, model_path: str, device: str, requires_fp16: bool = False): - super().__init__(model_path, device) - self.requires_fp16 = requires_fp16 - self.ort: ort.InferenceSession = None - self._load_model() - - def _load_model(self): - """Load the ONNX model.""" - providers, options = get_ort_providers( - self.device == "CPU", - self.device, - self.requires_fp16, - ) - - self.ort = ort.InferenceSession( - self.model_path, - providers=providers, - provider_options=options, - ) + def __init__(self, ort: ort.InferenceSession): + self.ort = ort def get_input_names(self) -> list[str]: return [input.name for input in self.ort.get_inputs()] @@ -66,24 +51,32 @@ class ONNXModelRunner(BaseModelRunner): def run(self, input: dict[str, Any]) -> Any | None: return self.ort.run(None, input) + def get_optimized_runner(model_path: str, device: str, **kwargs) -> BaseModelRunner: """Get an optimized runner for the hardware.""" if device == "CPU": return ONNXModelRunner(model_path, device, **kwargs) - + if is_rknn_compatible(model_path): rknn_path = auto_convert_model(model_path) if rknn_path: return RKNNModelRunner(rknn_path, device) - providers, options = get_ort_providers( - device == "CPU", - device, - **kwargs - ) + providers, options = get_ort_providers(device == "CPU", device, **kwargs) if "OpenVINOExecutionProvider" in providers: return OpenVINOModelRunner(model_path, device, **kwargs) - - return ONNXModelRunner(model_path, device, **kwargs) \ No newline at end of file + + ort = ort.InferenceSession( + model_path, + providers=providers, + provider_options=options, + ) + + cuda_idx = providers.index("CUDAExecutionProvider") + + if cuda_idx == 0: + return CudaGraphRunner(ort, options[cuda_idx].get("device_id", 0)) + + return ONNXModelRunner(model_path, device, **kwargs) diff --git a/frigate/detectors/plugins/onnx.py b/frigate/detectors/plugins/onnx.py index 4f903aa1f..ff9637268 100644 --- a/frigate/detectors/plugins/onnx.py +++ b/frigate/detectors/plugins/onnx.py @@ -5,6 +5,7 @@ import onnxruntime as ort from pydantic import Field from typing_extensions import Literal +from frigate.detectors.base_runner import BaseModelRunner from frigate.detectors.detection_api import DetectionApi from frigate.detectors.detector_config import ( BaseDetectorConfig, @@ -23,10 +24,13 @@ logger = logging.getLogger(__name__) DETECTOR_KEY = "onnx" -class CudaGraphRunner: +class CudaGraphRunner(BaseModelRunner): """Encapsulates CUDA Graph capture and replay using ONNX Runtime IOBinding. This runner assumes a single tensor input and binds all model outputs. + + NOTE: CUDA Graphs limit supported model operations, so they are not usable + for more complex models like CLIP or PaddleOCR. """ def __init__(self, session: ort.InferenceSession, cuda_device_id: int): @@ -39,6 +43,14 @@ class CudaGraphRunner: self._input_ortvalue: ort.OrtValue | None = None self._output_ortvalues: ort.OrtValue | None = None + def get_input_names(self) -> list[str]: + """Get input names for the model.""" + return [input.name for input in self._session.get_inputs()] + + def get_input_width(self) -> int: + """Get the input width of the model.""" + return self._session.get_inputs()[0].shape[3] + def run(self, input_name: str, tensor_input: np.ndarray): tensor_input = np.ascontiguousarray(tensor_input) @@ -114,7 +126,6 @@ class ONNXDetector(DetectionApi): try: if "CUDAExecutionProvider" in providers: - cuda_idx = providers.index("CUDAExecutionProvider") self._cuda_device_id = options[cuda_idx].get("device_id", 0) if options[cuda_idx].get("enable_cuda_graph"): diff --git a/frigate/detectors/plugins/openvino.py b/frigate/detectors/plugins/openvino.py index 895612952..6603df1a1 100644 --- a/frigate/detectors/plugins/openvino.py +++ b/frigate/detectors/plugins/openvino.py @@ -41,18 +41,20 @@ class OpenVINOModelRunner: def __init__(self, model_path: str, device: str, **kwargs): self.model_path = model_path self.device = device - + if not os.path.isfile(model_path): raise FileNotFoundError(f"OpenVINO model file {model_path} not found.") - + self.ov_core = ov.Core() - + # Apply performance optimization self.ov_core.set_property(device, {"PERF_COUNT": "NO"}) - + # Compile model - self.compiled_model = self.ov_core.compile_model(model=model_path, device_name=device) - + self.compiled_model = self.ov_core.compile_model( + model=model_path, device_name=device + ) + # Create reusable inference request self.infer_request = self.compiled_model.create_infer_request() input_shape = self.compiled_model.inputs[0].get_shape() @@ -70,24 +72,24 @@ class OpenVINOModelRunner: def run(self, input_data: np.ndarray) -> list[np.ndarray]: """Run inference with the model. - + Args: input_data: Input tensor data - + Returns: List of output tensors """ # Copy input data to pre-allocated tensor np.copyto(self.input_tensor.data, input_data) - + # Run inference self.infer_request.infer(self.input_tensor) - + # Get all output tensors outputs = [] for i in range(len(self.compiled_model.outputs)): outputs.append(self.infer_request.get_output_tensor(i).data) - + return outputs @@ -110,16 +112,15 @@ class OvDetector(DetectionApi): self.w = detector_config.model.width self.runner = OpenVINOModelRunner( - model_path=detector_config.model.path, - device=detector_config.device + model_path=detector_config.model.path, device=detector_config.device ) - + # For dfine models, also pre-allocate target sizes tensor if self.ov_model_type == ModelTypeEnum.dfine: self.target_sizes_tensor = ov.Tensor( np.array([[self.h, self.w]], dtype=np.int64) ) - + self.model_invalid = False if self.ov_model_type not in self.supported_models: @@ -173,7 +174,9 @@ class OvDetector(DetectionApi): self.output_indexes = 0 while True: try: - tensor_shape = self.runner.compiled_model.output(self.output_indexes).shape + tensor_shape = self.runner.compiled_model.output( + self.output_indexes + ).shape logger.info( f"Model Output-{self.output_indexes} Shape: {tensor_shape}" ) @@ -205,12 +208,12 @@ class OvDetector(DetectionApi): # Use named inputs for dfine models inputs = { "images": tensor_input, - "orig_target_sizes": np.array([[self.h, self.w]], dtype=np.int64) + "orig_target_sizes": np.array([[self.h, self.w]], dtype=np.int64), } outputs = self.runner.run_with_named_inputs(inputs) tensor_output = ( outputs["output0"], - outputs["output1"], + outputs["output1"], outputs["output2"], ) return post_process_dfine(tensor_output, self.w, self.h)