Add config for selecting GPU. Fix Async inference. Update documentation.

Nate Meyer 2022-12-29 12:26:29 -05:00
parent de251c2c21
commit bd4983d203
2 changed files with 62 additions and 22 deletions

View File

@@ -155,7 +155,19 @@ NVidia GPUs may be used for object detection using the TensorRT libraries.
### Minimum Hardware Support
**TODO**
The TensorRT detector uses the 11.x series of CUDA libraries, which have minor version compatibility. The minimum driver version on the host system must be `>=450.80.02`. The GPU must also support a Compute Capability of `5.0` or greater, which generally corresponds to a Maxwell-era GPU or newer; check the NVIDIA GPU Compute Capability table linked below.
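As a quick check of these requirements, the host driver version is shown by `nvidia-smi`, and the compute capability of each visible GPU can be queried with the same `cuda-python` bindings the detector itself uses. The sketch below is illustrative only and assumes the `cuda-python` package is installed (for example inside the TensorRT container used later in this section):
```python
from cuda import cuda

# Print the compute capability of each visible GPU (requirement: >= 5.0).
(err,) = cuda.cuInit(0)
assert err == cuda.CUresult.CUDA_SUCCESS, f"cuInit failed: {err}"

err, count = cuda.cuDeviceGetCount()
for i in range(count):
    err, dev = cuda.cuDeviceGet(i)
    err, name = cuda.cuDeviceGetName(64, dev)
    err, major = cuda.cuDeviceGetAttribute(
        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev
    )
    err, minor = cuda.cuDeviceGetAttribute(
        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev
    )
    ok = (major, minor) >= (5, 0)
    print(f"GPU {i}: {name.decode(errors='ignore').rstrip(chr(0))} "
          f"compute {major}.{minor} ({'OK' if ok else 'unsupported'})")
```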
> **TODO:** NVidia claims support for compute capability 3.5 and 3.7, but marks it as deprecated. This would mean that some, but not all, Kepler GPUs might work. This needs testing before making any claims of support.
Newer GPU architectures offer capabilities that TensorRT can take advantage of, such as INT8 operations and Tensor cores. The features compatible with your hardware will be optimized when the model is converted to a `.trt` file. Currently, the provided model generation script includes a switch to enable or disable FP16 operations. Using newer features such as INT8 optimization requires additional work.
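To see whether your particular GPU has fast FP16 or INT8 kernels before converting a model, the TensorRT 8.x Python API exposes this on the builder. This is a minimal sketch, intended to run inside the same `nvcr.io/nvidia/tensorrt:22.07-py3` container used in the next section:
```python
import tensorrt as trt

# Ask TensorRT whether this GPU has fast FP16 / INT8 kernels.
builder = trt.Builder(trt.Logger(trt.Logger.WARNING))
print("fast FP16:", builder.platform_has_fast_fp16)
print("fast INT8:", builder.platform_has_fast_int8)
```
If fast FP16 is not reported, enabling the FP16 switch during model generation is unlikely to improve inference speed.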
#### Compatibility References:
[NVIDIA TensorRT Support Matrix](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-841/support-matrix/index.html)
[NVIDIA CUDA Compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html)
[NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus)
### Generate Models
@@ -165,7 +177,7 @@ To generate the model files, create a new folder to save the models, download th
```bash
mkdir trt-models
wget https://github.com/blakeblackshear/frigate/raw/master/docker/tensorrt_models.sh
wget https://raw.githubusercontent.com/blakeblackshear/frigate/nvidia-detector/docker/tensorrt_models.sh
chmod +x tensorrt_models.sh
docker run --gpus=all --rm -it -v `pwd`/trt-models:/tensorrt_models -v `pwd`/tensorrt_models.sh:/tensorrt_models.sh nvcr.io/nvidia/tensorrt:22.07-py3 /tensorrt_models.sh
```
@@ -202,19 +214,21 @@ yolov7-tiny-416
### Configuration Parameters
**TODO**
The TensorRT detector can be selected by specifying `tensorrt` as the model type. The GPU will need to be passed through to the Docker container using the same methods described in the [Hardware Acceleration](hardware_acceleration.md#nvidia-gpu) section. If you pass through multiple GPUs, you can select which GPU is used for a detector with the `device` configuration parameter. The `device` parameter is the integer index of the GPU, as shown by `nvidia-smi` within the container.
Sample:
The TensorRT detector uses `.trt` model files that are located in `/trt-models/` by default. The model file path and dimensions used will depend on which model you have generated.
```yaml
detectors:
  tensorrt:
    type: tensorrt
    device: 0 #This is the default, select the first GPU

model:
  path: /trt-models/yolov7-tiny-416.trt
  labelmap_path: /trt-models/coco_91cl.txt
  input_tensor: nchw
  input_pixel_format: rgb
  width: 416
  height: 416
```
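To confirm that the `path`, `width`, and `height` in your config match the engine you generated, the engine's bindings can be inspected with the same TensorRT calls the detector uses when loading the model. A minimal sketch (run where the GPU and TensorRT are available, adjusting the engine path to the model you generated):
```python
import tensorrt as trt

# Load the generated engine and print each binding's shape.
trt_logger = trt.Logger(trt.Logger.WARNING)
with open("/trt-models/yolov7-tiny-416.trt", "rb") as f, trt.Runtime(trt_logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

for binding in engine:
    kind = "input" if engine.binding_is_input(binding) else "output"
    print(f"{kind} {binding}: {engine.get_binding_shape(binding)}")
```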

View File

@@ -46,7 +46,7 @@ if TRT_SUPPORT:
class TensorRTDetectorConfig(BaseDetectorConfig):
    type: Literal[DETECTOR_KEY]
    device: str = Field(default=None, title="Device Type")
    device: int = Field(default=0, title="GPU Device Index")

class HostDeviceMem(object):
@@ -90,9 +90,8 @@ class TensorRtDetector(DetectionApi):
                e,
            )
        self.runtime = trt.Runtime(self.trt_logger)
        with open(model_path, "rb") as f:
            return self.runtime.deserialize_cuda_engine(f.read())
        with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def _get_input_shape(self):
        """Get input shape of the TensorRT YOLO engine."""
@@ -120,7 +119,6 @@ class TensorRtDetector(DetectionApi):
        outputs = []
        bindings = []
        output_idx = 0
        err, stream = cuda.cuStreamCreate(0)
        for binding in self.engine:
            binding_dims = self.engine.get_binding_shape(binding)
            if len(binding_dims) == 4:
@@ -159,7 +157,7 @@
                output_idx += 1
        assert len(inputs) == 1, f"inputs len was {len(inputs)}"
        assert len(outputs) == 1, f"output len was {len(outputs)}"
        return inputs, outputs, bindings, stream
        return inputs, outputs, bindings

    def _do_inference(self):
        """do_inference (for TensorRT 7.0+)
@@ -167,15 +165,33 @@
        dimension networks.
        Inputs and outputs are expected to be lists of HostDeviceMem objects.
        """
        # Push CUDA Context
        cuda.cuCtxPushCurrent(self.cu_ctx)

        # Transfer input data to the GPU.
        [cuda.cuMemcpyHtoD(inp.device, inp.host, inp.nbytes) for inp in self.inputs]
        [
            cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, self.stream)
            for inp in self.inputs
        ]

        # Run inference.
        if not self.context.execute_v2(bindings=self.bindings):
        if not self.context.execute_async_v2(
            bindings=self.bindings, stream_handle=self.stream
        ):
            logger.warn(f"Execute returned false")

        # Transfer predictions back from the GPU.
        [cuda.cuMemcpyDtoH(out.host, out.device, out.nbytes) for out in self.outputs]
        [
            cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, self.stream)
            for out in self.outputs
        ]

        # Synchronize the stream
        # cuda.cuStreamSynchronize(self.stream)
        cuda.cuStreamSynchronize(self.stream)

        # Pop CUDA Context
        cuda.cuCtxPopCurrent()

        # Return only the host outputs.
        return [
            np.array(
@@ -193,9 +209,18 @@ class TensorRtDetector(DetectionApi):
        assert (
            cuda_err == cuda.CUresult.CUDA_SUCCESS
        ), f"Failed to initialize cuda {cuda_err}"
        err, self.cu_ctx = cuda.cuCtxCreate(cuda.CUctx_flags.CU_CTX_MAP_HOST, 0)
        err, dev_count = cuda.cuDeviceGetCount()
        logger.debug(f"Num Available Devices: {dev_count}")
        assert (
            detector_config.device < dev_count
        ), f"Invalid TensorRT Device Config. Device {detector_config.device} Invalid."
        err, self.cu_ctx = cuda.cuCtxCreate(
            cuda.CUctx_flags.CU_CTX_MAP_HOST, detector_config.device
        )

        self.conf_th = 0.4 ##TODO: model config parameter
        self.nms_threshold = 0.4
        err, self.stream = cuda.cuStreamCreate(0)
        self.trt_logger = TrtLogger()
        self.engine = self._load_engine(detector_config.model.path)
        self.input_shape = self._get_input_shape()
@@ -206,7 +231,6 @@ class TensorRtDetector(DetectionApi):
                self.inputs,
                self.outputs,
                self.bindings,
                self.stream,
            ) = self._allocate_buffers()
        except Exception as e:
            logger.error(e)
@@ -217,12 +241,14 @@ class TensorRtDetector(DetectionApi):
    def __del__(self):
        """Free CUDA memories."""
        if self.outputs is not None:
            del self.outputs
        if self.inputs is not None:
            del self.inputs
        if self.stream is not None:
            cuda.cuStreamDestroy(self.stream)
            del self.stream
        del self.engine
        del self.runtime
        del self.context
        del self.trt_logger
        cuda.cuCtxDestroy(self.cu_ctx)