Add config for selecting GPU. Fix Async inference. Update documentation.

Nate Meyer 2022-12-29 12:26:29 -05:00
parent de251c2c21
commit bd4983d203
2 changed files with 62 additions and 22 deletions


@@ -155,7 +155,19 @@ NVidia GPUs may be used for object detection using the TensorRT libraries.
 ### Minimum Hardware Support
-**TODO**
+The TensorRT detector uses the 11.x series of CUDA libraries, which have minor version compatibility. The minimum driver version on the host system must be `>=450.80.02`. The GPU must also support a Compute Capability of `5.0` or greater; this generally corresponds to a Maxwell-era GPU or newer. Check the NVIDIA GPU Compute Capability table linked below.
+
+> **TODO:** NVidia claims support on compute 3.5 and 3.7, but marks it as deprecated. This would leave some, but not all, Kepler GPUs possibly working. This needs testing before making any claims of support.
+
+There are improved capabilities in newer GPU architectures that TensorRT can benefit from, such as INT8 operations and Tensor Cores. The features compatible with your hardware will be optimized when the model is converted to a `.trt` file. The provided model-generation script currently exposes a switch to enable/disable FP16 operations. Using newer features such as INT8 optimization requires more work.
+
+#### Compatibility References:
+
+[NVIDIA TensorRT Support Matrix](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-841/support-matrix/index.html)
+
+[NVIDIA CUDA Compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html)
+
+[NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus)
+
 ### Generate Models
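As an aside (not part of this commit), the minimums described above can be sanity-checked from inside the container with the same cuda-python driver bindings the detector uses. This is a minimal sketch, assuming the `cuda` package is importable; it reports the CUDA version the installed driver supports and the Compute Capability of the first GPU (the driver build itself, e.g. `450.80.02`, is shown by `nvidia-smi`):

```python
# Sketch only: check the driver-supported CUDA version and the Compute
# Capability of GPU 0 against the minimums described above.
from cuda import cuda

(err,) = cuda.cuInit(0)
assert err == cuda.CUresult.CUDA_SUCCESS, f"cuInit failed: {err}"

# cuDriverGetVersion returns e.g. 11040 for CUDA 11.4.
err, drv = cuda.cuDriverGetVersion()
print(f"Driver supports CUDA {drv // 1000}.{(drv % 1000) // 10}")

err, dev = cuda.cuDeviceGet(0)
err, major = cuda.cuDeviceGetAttribute(
    cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev
)
err, minor = cuda.cuDeviceGetAttribute(
    cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev
)
print(f"GPU 0 compute capability: {major}.{minor} (need >= 5.0)")
```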
@@ -165,7 +177,7 @@ To generate the model files, create a new folder to save the models, download th
 ```bash
 mkdir trt-models
-wget https://github.com/blakeblackshear/frigate/raw/master/docker/tensorrt_models.sh
+wget https://raw.githubusercontent.com/blakeblackshear/frigate/nvidia-detector/docker/tensorrt_models.sh
 chmod +x tensorrt_models.sh
 docker run --gpus=all --rm -it -v `pwd`/trt-models:/tensorrt_models -v `pwd`/tensorrt_models.sh:/tensorrt_models.sh nvcr.io/nvidia/tensorrt:22.07-py3 /tensorrt_models.sh
 ```
@@ -202,19 +214,21 @@ yolov7-tiny-416
 ### Configuration Parameters
-**TODO**
-Sample:
+The TensorRT detector can be selected by specifying `tensorrt` as the model type. The GPU will need to be passed through to the docker container using the same methods described in the [Hardware Acceleration](hardware_acceleration.md#nvidia-gpu) section. If you pass through multiple GPUs, you can select which GPU is used for a detector with the `device` configuration parameter. The `device` parameter is the integer index of the GPU, as shown by `nvidia-smi` within the container.
+
+The TensorRT detector uses `.trt` model files, located in `/trt-models/` by default. The model path and dimensions used will depend on which model you have generated.
+
 ```yaml
 detectors:
   tensorrt:
     type: tensorrt
+    device: 0 #This is the default, select the first GPU
 
 model:
   path: /trt-models/yolov7-tiny-416.trt
   labelmap_path: /trt-models/coco_91cl.txt
   input_tensor: nchw
+  input_pixel_format: rgb
   width: 416
   height: 416
 ```
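Not part of the commit, but as a quick way to see which integer index corresponds to which GPU when choosing the `device` value, the visible devices can be listed with the same cuda-python bindings; a minimal sketch, assuming the `cuda` package is available inside the container (indices are the CUDA driver's ordering):

```python
# Sketch only: print each visible GPU with the index expected by the
# detector's `device` configuration parameter.
from cuda import cuda

(err,) = cuda.cuInit(0)
err, count = cuda.cuDeviceGetCount()
for idx in range(count):
    err, dev = cuda.cuDeviceGet(idx)
    err, raw_name = cuda.cuDeviceGetName(128, dev)
    name = raw_name.decode("utf-8").rstrip("\x00")
    print(f"device: {idx}  ->  {name}")
```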


@@ -46,7 +46,7 @@ if TRT_SUPPORT:
 class TensorRTDetectorConfig(BaseDetectorConfig):
     type: Literal[DETECTOR_KEY]
-    device: str = Field(default=None, title="Device Type")
+    device: int = Field(default=0, title="GPU Device Index")
 
 class HostDeviceMem(object):
@@ -90,9 +90,8 @@ class TensorRtDetector(DetectionApi):
                 e,
             )
-        self.runtime = trt.Runtime(self.trt_logger)
-        with open(model_path, "rb") as f:
-            return self.runtime.deserialize_cuda_engine(f.read())
+        with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
+            return runtime.deserialize_cuda_engine(f.read())
 
     def _get_input_shape(self):
         """Get input shape of the TensorRT YOLO engine."""
@@ -120,7 +119,6 @@ class TensorRtDetector(DetectionApi):
         outputs = []
         bindings = []
         output_idx = 0
-        err, stream = cuda.cuStreamCreate(0)
         for binding in self.engine:
             binding_dims = self.engine.get_binding_shape(binding)
             if len(binding_dims) == 4:
@@ -159,7 +157,7 @@ class TensorRtDetector(DetectionApi):
                 output_idx += 1
         assert len(inputs) == 1, f"inputs len was {len(inputs)}"
         assert len(outputs) == 1, f"output len was {len(outputs)}"
-        return inputs, outputs, bindings, stream
+        return inputs, outputs, bindings
 
     def _do_inference(self):
         """do_inference (for TensorRT 7.0+)
@ -167,15 +165,33 @@ class TensorRtDetector(DetectionApi):
dimension networks. dimension networks.
Inputs and outputs are expected to be lists of HostDeviceMem objects. Inputs and outputs are expected to be lists of HostDeviceMem objects.
""" """
# Push CUDA Context
cuda.cuCtxPushCurrent(self.cu_ctx)
# Transfer input data to the GPU. # Transfer input data to the GPU.
[cuda.cuMemcpyHtoD(inp.device, inp.host, inp.nbytes) for inp in self.inputs] [
cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, self.stream)
for inp in self.inputs
]
# Run inference. # Run inference.
if not self.context.execute_v2(bindings=self.bindings): if not self.context.execute_async_v2(
bindings=self.bindings, stream_handle=self.stream
):
logger.warn(f"Execute returned false") logger.warn(f"Execute returned false")
# Transfer predictions back from the GPU. # Transfer predictions back from the GPU.
[cuda.cuMemcpyDtoH(out.host, out.device, out.nbytes) for out in self.outputs] [
cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, self.stream)
for out in self.outputs
]
# Synchronize the stream # Synchronize the stream
# cuda.cuStreamSynchronize(self.stream) cuda.cuStreamSynchronize(self.stream)
# Pop CUDA Context
cuda.cuCtxPopCurrent()
# Return only the host outputs. # Return only the host outputs.
return [ return [
np.array( np.array(
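To illustrate the async pattern the hunk above adopts (not the detector code itself), the sketch below condenses it into a standalone function: all copies and the engine execution are queued on a single CUDA stream, and the only blocking point is the final stream synchronize. The function name and parameters (`do_inference_async`, `cu_ctx`, `stream`, `context`, `bindings`, `inputs`, `outputs`) are hypothetical; the individual calls are the ones used in the diff.

```python
from cuda import cuda  # cuda-python driver bindings, as used by the detector


def do_inference_async(cu_ctx, stream, context, bindings, inputs, outputs):
    """Illustrative sketch (hypothetical helper, not the detector method).

    `inputs`/`outputs` are HostDeviceMem-style objects with pinned `host`
    memory, a `device` pointer and an `nbytes` size; `context` is a TensorRT
    IExecutionContext.
    """
    cuda.cuCtxPushCurrent(cu_ctx)  # make this context current on the thread
    for inp in inputs:             # queue host-to-device copies on the stream
        cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, stream)
    # Queue inference on the same stream.
    ok = context.execute_async_v2(bindings=bindings, stream_handle=stream)
    for out in outputs:            # queue device-to-host copies of the results
        cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, stream)
    cuda.cuStreamSynchronize(stream)  # single blocking point: wait for all work
    cuda.cuCtxPopCurrent()
    return ok
```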
@@ -193,9 +209,18 @@ class TensorRtDetector(DetectionApi):
         assert (
             cuda_err == cuda.CUresult.CUDA_SUCCESS
         ), f"Failed to initialize cuda {cuda_err}"
-        err, self.cu_ctx = cuda.cuCtxCreate(cuda.CUctx_flags.CU_CTX_MAP_HOST, 0)
+        err, dev_count = cuda.cuDeviceGetCount()
+        logger.debug(f"Num Available Devices: {dev_count}")
+        assert (
+            detector_config.device < dev_count
+        ), f"Invalid TensorRT Device Config. Device {detector_config.device} Invalid."
+        err, self.cu_ctx = cuda.cuCtxCreate(
+            cuda.CUctx_flags.CU_CTX_MAP_HOST, detector_config.device
+        )
         self.conf_th = 0.4  ##TODO: model config parameter
         self.nms_threshold = 0.4
+        err, self.stream = cuda.cuStreamCreate(0)
         self.trt_logger = TrtLogger()
         self.engine = self._load_engine(detector_config.model.path)
         self.input_shape = self._get_input_shape()
@@ -206,7 +231,6 @@ class TensorRtDetector(DetectionApi):
                 self.inputs,
                 self.outputs,
                 self.bindings,
-                self.stream,
             ) = self._allocate_buffers()
         except Exception as e:
             logger.error(e)
@@ -217,12 +241,14 @@ class TensorRtDetector(DetectionApi):
     def __del__(self):
         """Free CUDA memories."""
+        if self.outputs is not None:
             del self.outputs
+        if self.inputs is not None:
             del self.inputs
+        if self.stream is not None:
             cuda.cuStreamDestroy(self.stream)
             del self.stream
         del self.engine
-        del self.runtime
         del self.context
         del self.trt_logger
         cuda.cuCtxDestroy(self.cu_ctx)
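As an illustrative aside (not part of the commit), the per-detector CUDA setup introduced in this change boils down to: validate the configured device index, create a context on that device, and create a stream for the async work; teardown reverses it, as in the `__del__` hunk above. A minimal sketch with hypothetical helper names, using only the calls that appear in the diff:

```python
from cuda import cuda


def create_cuda_resources(device_index: int):
    """Sketch of the setup sequence (hypothetical helper)."""
    (err,) = cuda.cuInit(0)
    err, dev_count = cuda.cuDeviceGetCount()
    assert device_index < dev_count, f"invalid device index {device_index}"
    # CU_CTX_MAP_HOST so pinned host buffers can be used, as in the detector.
    err, ctx = cuda.cuCtxCreate(cuda.CUctx_flags.CU_CTX_MAP_HOST, device_index)
    err, stream = cuda.cuStreamCreate(0)
    return ctx, stream


def destroy_cuda_resources(ctx, stream):
    """Sketch of the matching teardown."""
    cuda.cuStreamDestroy(stream)
    cuda.cuCtxDestroy(ctx)
```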