Add config for selecting GPU. Fix Async inference. Update documentation.

Nate Meyer 2022-12-29 12:26:29 -05:00
parent de251c2c21
commit bd4983d203
2 changed files with 62 additions and 22 deletions

View File

@@ -155,7 +155,19 @@ NVidia GPUs may be used for object detection using the TensorRT libraries.
### Minimum Hardware Support
**TODO**
The TensorRT detector uses the 11.x series of CUDA libraries, which have minor version compatibility. The minimum driver version on the host system must be `>=450.80.02`. The GPU must also support a Compute Capability of `5.0` or greater, which generally corresponds to a Maxwell-era GPU or newer; check the NVIDIA GPU Compute Capability table linked below.
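As a quick check of these requirements, the host driver version is shown by `nvidia-smi`, and the compute capability of each visible GPU can be queried with the same `cuda-python` bindings the detector itself uses. The sketch below is illustrative only and assumes the `cuda-python` package is installed (for example inside the TensorRT container used later in this section):
```python
from cuda import cuda

# Print the compute capability of each visible GPU (requirement: >= 5.0).
(err,) = cuda.cuInit(0)
assert err == cuda.CUresult.CUDA_SUCCESS, f"cuInit failed: {err}"

err, count = cuda.cuDeviceGetCount()
for i in range(count):
    err, dev = cuda.cuDeviceGet(i)
    err, name = cuda.cuDeviceGetName(64, dev)
    err, major = cuda.cuDeviceGetAttribute(
        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev
    )
    err, minor = cuda.cuDeviceGetAttribute(
        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev
    )
    ok = (major, minor) >= (5, 0)
    print(f"GPU {i}: {name.decode(errors='ignore').rstrip(chr(0))} "
          f"compute {major}.{minor} ({'OK' if ok else 'unsupported'})")
```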
> **TODO:** NVidia claims support for compute capability 3.5 and 3.7, but marks it as deprecated. This would mean that some, but not all, Kepler GPUs might work. This needs testing before making any claims of support.
Newer GPU architectures offer capabilities that TensorRT can take advantage of, such as INT8 operations and Tensor cores. The features compatible with your hardware will be optimized when the model is converted to a `.trt` file. Currently, the provided model generation script includes a switch to enable or disable FP16 operations. Using newer features such as INT8 optimization requires additional work.
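To see whether your particular GPU has fast FP16 or INT8 kernels before converting a model, the TensorRT 8.x Python API exposes this on the builder. This is a minimal sketch, intended to run inside the same `nvcr.io/nvidia/tensorrt:22.07-py3` container used in the next section:
```python
import tensorrt as trt

# Ask TensorRT whether this GPU has fast FP16 / INT8 kernels.
builder = trt.Builder(trt.Logger(trt.Logger.WARNING))
print("fast FP16:", builder.platform_has_fast_fp16)
print("fast INT8:", builder.platform_has_fast_int8)
```
If fast FP16 is not reported, enabling the FP16 switch during model generation is unlikely to improve inference speed.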
#### Compatibility References:
[NVIDIA TensorRT Support Matrix](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-841/support-matrix/index.html)
[NVIDIA CUDA Compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html)
[NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus)
### Generate Models
@@ -165,7 +177,7 @@ To generate the model files, create a new folder to save the models, download th
```bash
mkdir trt-models
wget https://github.com/blakeblackshear/frigate/raw/master/docker/tensorrt_models.sh
wget https://raw.githubusercontent.com/blakeblackshear/frigate/nvidia-detector/docker/tensorrt_models.sh
chmod +x tensorrt_models.sh
docker run --gpus=all --rm -it -v `pwd`/trt-models:/tensorrt_models -v `pwd`/tensorrt_models.sh:/tensorrt_models.sh nvcr.io/nvidia/tensorrt:22.07-py3 /tensorrt_models.sh
```
@@ -202,19 +214,21 @@ yolov7-tiny-416
### Configuration Parameters
**TODO**
The TensorRT detector can be selected by specifying `tensorrt` as the model type. The GPU will need to be passed through to the Docker container using the same methods described in the [Hardware Acceleration](hardware_acceleration.md#nvidia-gpu) section. If you pass through multiple GPUs, you can select which GPU is used for a detector with the `device` configuration parameter. The `device` parameter is the integer index of the GPU, as shown by `nvidia-smi` within the container.
Sample:
The TensorRT detector uses `.trt` model files that are located in `/trt-models/` by default. The model file path and dimensions used will depend on which model you have generated.
```yaml
detectors:
  tensorrt:
    type: tensorrt
    device: 0 #This is the default, select the first GPU

model:
  path: /trt-models/yolov7-tiny-416.trt
  labelmap_path: /trt-models/coco_91cl.txt
  input_tensor: nchw
  input_pixel_format: rgb
  width: 416
  height: 416
```
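To confirm that the `path`, `width`, and `height` in your config match the engine you generated, the engine's bindings can be inspected with the same TensorRT calls the detector uses when loading the model. A minimal sketch (run where the GPU and TensorRT are available, adjusting the engine path to the model you generated):
```python
import tensorrt as trt

# Load the generated engine and print each binding's shape.
trt_logger = trt.Logger(trt.Logger.WARNING)
with open("/trt-models/yolov7-tiny-416.trt", "rb") as f, trt.Runtime(trt_logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

for binding in engine:
    kind = "input" if engine.binding_is_input(binding) else "output"
    print(f"{kind} {binding}: {engine.get_binding_shape(binding)}")
```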

View File

@@ -46,7 +46,7 @@ if TRT_SUPPORT:
class TensorRTDetectorConfig(BaseDetectorConfig):
    type: Literal[DETECTOR_KEY]
    device: str = Field(default=None, title="Device Type")
    device: int = Field(default=0, title="GPU Device Index")

class HostDeviceMem(object):
@@ -90,9 +90,8 @@ class TensorRtDetector(DetectionApi):
                e,
            )
        self.runtime = trt.Runtime(self.trt_logger)
        with open(model_path, "rb") as f:
            return self.runtime.deserialize_cuda_engine(f.read())
        with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def _get_input_shape(self):
        """Get input shape of the TensorRT YOLO engine."""
@@ -120,7 +119,6 @@ class TensorRtDetector(DetectionApi):
        outputs = []
        bindings = []
        output_idx = 0
        err, stream = cuda.cuStreamCreate(0)
        for binding in self.engine:
            binding_dims = self.engine.get_binding_shape(binding)
            if len(binding_dims) == 4:
@@ -159,7 +157,7 @@
                output_idx += 1
        assert len(inputs) == 1, f"inputs len was {len(inputs)}"
        assert len(outputs) == 1, f"output len was {len(outputs)}"
        return inputs, outputs, bindings, stream
        return inputs, outputs, bindings

    def _do_inference(self):
        """do_inference (for TensorRT 7.0+)
@@ -167,15 +165,33 @@
        dimension networks.
        Inputs and outputs are expected to be lists of HostDeviceMem objects.
        """
        # Push CUDA Context
        cuda.cuCtxPushCurrent(self.cu_ctx)

        # Transfer input data to the GPU.
        [cuda.cuMemcpyHtoD(inp.device, inp.host, inp.nbytes) for inp in self.inputs]
        [
            cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, self.stream)
            for inp in self.inputs
        ]

        # Run inference.
        if not self.context.execute_v2(bindings=self.bindings):
        if not self.context.execute_async_v2(
            bindings=self.bindings, stream_handle=self.stream
        ):
            logger.warn(f"Execute returned false")

        # Transfer predictions back from the GPU.
        [cuda.cuMemcpyDtoH(out.host, out.device, out.nbytes) for out in self.outputs]
        [
            cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, self.stream)
            for out in self.outputs
        ]

        # Synchronize the stream
        # cuda.cuStreamSynchronize(self.stream)
        cuda.cuStreamSynchronize(self.stream)

        # Pop CUDA Context
        cuda.cuCtxPopCurrent()

        # Return only the host outputs.
        return [
            np.array(
@@ -193,9 +209,18 @@ class TensorRtDetector(DetectionApi):
        assert (
            cuda_err == cuda.CUresult.CUDA_SUCCESS
        ), f"Failed to initialize cuda {cuda_err}"
        err, self.cu_ctx = cuda.cuCtxCreate(cuda.CUctx_flags.CU_CTX_MAP_HOST, 0)
        err, dev_count = cuda.cuDeviceGetCount()
        logger.debug(f"Num Available Devices: {dev_count}")
        assert (
            detector_config.device < dev_count
        ), f"Invalid TensorRT Device Config. Device {detector_config.device} Invalid."
        err, self.cu_ctx = cuda.cuCtxCreate(
            cuda.CUctx_flags.CU_CTX_MAP_HOST, detector_config.device
        )

        self.conf_th = 0.4 ##TODO: model config parameter
        self.nms_threshold = 0.4
        err, self.stream = cuda.cuStreamCreate(0)
        self.trt_logger = TrtLogger()
        self.engine = self._load_engine(detector_config.model.path)
        self.input_shape = self._get_input_shape()
@@ -206,7 +231,6 @@ class TensorRtDetector(DetectionApi):
                self.inputs,
                self.outputs,
                self.bindings,
                self.stream,
            ) = self._allocate_buffers()
        except Exception as e:
            logger.error(e)
@@ -217,12 +241,14 @@ class TensorRtDetector(DetectionApi):
    def __del__(self):
        """Free CUDA memories."""
        if self.outputs is not None:
            del self.outputs
        if self.inputs is not None:
            del self.inputs
        if self.stream is not None:
            cuda.cuStreamDestroy(self.stream)
            del self.stream
        del self.engine
        del self.runtime
        del self.context
        del self.trt_logger
        cuda.cuCtxDestroy(self.cu_ctx)