Add config for selecting GPU. Fix Async inference. Update documentation.
parent de251c2c21
commit bd4983d203
@@ -155,7 +155,19 @@ NVidia GPUs may be used for object detection using the TensorRT libraries.
 
 ### Minimum Hardware Support
 
-**TODO**
+The TensorRT detector uses the 11.x series of CUDA libraries, which have minor version compatibility. The minimum driver version on the host system must be `>=450.80.02`, and the GPU must support a Compute Capability of `5.0` or greater. This generally corresponds to a Maxwell-era GPU or newer; check the NVIDIA GPU Compute Capability table linked below.
 
+> **TODO:** NVidia claims support on compute 3.5 and 3.7, but marks it as deprecated. This would have some, but not all, Kepler GPUs as possibly working. This needs testing before making any claims of support.
+
+There are improved capabilities in newer GPU architectures that TensorRT can benefit from, such as INT8 operations and Tensor cores. The features compatible with your hardware will be optimized when the model is converted to a trt file. Currently the script provided for generating the model includes a switch to enable/disable FP16 operations. If you wish to use newer features such as INT8 optimization, more work is required.
+
+#### Compatibility References:
+
+[NVIDIA TensorRT Support Matrix](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-841/support-matrix/index.html)
+
+[NVIDIA CUDA Compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html)
+
+[NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus)
+
 ### Generate Models
 
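As a quick way to verify the Compute Capability requirement described above, here is a minimal sketch, assuming the NVIDIA `cuda-python` package is installed (the same driver-API style of bindings that the detector's `cuda.cu*` calls use); the helper name and printed output are illustrative only and are not part of this commit.

```python
# Illustrative check only; not part of the detector code in this commit.
from cuda import cuda


def report_compute_capability(minimum=(5, 0)):
    (err,) = cuda.cuInit(0)
    assert err == cuda.CUresult.CUDA_SUCCESS, f"Failed to initialize cuda {err}"
    err, dev_count = cuda.cuDeviceGetCount()
    for idx in range(dev_count):
        err, dev = cuda.cuDeviceGet(idx)
        err, major = cuda.cuDeviceGetAttribute(
            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev
        )
        err, minor = cuda.cuDeviceGetAttribute(
            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev
        )
        ok = (major, minor) >= minimum
        print(f"GPU {idx}: compute capability {major}.{minor} (supported: {ok})")


if __name__ == "__main__":
    report_compute_capability()
```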
@@ -165,7 +177,7 @@ To generate the model files, create a new folder to save the models, download th
 
 ```bash
 mkdir trt-models
-wget https://github.com/blakeblackshear/frigate/raw/master/docker/tensorrt_models.sh
+wget https://raw.githubusercontent.com/blakeblackshear/frigate/nvidia-detector/docker/tensorrt_models.sh
 chmod +x tensorrt_models.sh
 docker run --gpus=all --rm -it -v `pwd`/trt-models:/tensorrt_models -v `pwd`/tensorrt_models.sh:/tensorrt_models.sh nvcr.io/nvidia/tensorrt:22.07-py3 /tensorrt_models.sh
 ```
@@ -202,19 +214,21 @@ yolov7-tiny-416
 
 ### Configuration Parameters
 
-**TODO**
+The TensorRT detector can be selected by specifying `tensorrt` as the detector type. The GPU will need to be passed through to the docker container using the same methods described in the [Hardware Acceleration](hardware_acceleration.md#nvidia-gpu) section. If you pass through multiple GPUs, you can select which GPU is used for a detector with the `device` configuration parameter. The `device` parameter is an integer value of the GPU index, as shown by `nvidia-smi` within the container.
 
-Sample:
+The TensorRT detector uses `.trt` model files that are located in `/trt-models/` by default. The model file path and dimensions used will depend on which model you have generated.
 
 ```yaml
 detectors:
   tensorrt:
     type: tensorrt
+    device: 0 #This is the default, select the first GPU
 
 model:
   path: /trt-models/yolov7-tiny-416.trt
   labelmap_path: /trt-models/coco_91cl.txt
   input_tensor: nchw
+  input_pixel_format: rgb
   width: 416
   height: 416
 ```
@@ -46,7 +46,7 @@ if TRT_SUPPORT:
 
 class TensorRTDetectorConfig(BaseDetectorConfig):
     type: Literal[DETECTOR_KEY]
-    device: str = Field(default=None, title="Device Type")
+    device: int = Field(default=0, title="GPU Device Index")
 
 
 class HostDeviceMem(object):
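To illustrate the effect of changing `device` from an optional string to an integer index, here is a small standalone sketch of how pydantic treats the new field; the class name is hypothetical and `BaseDetectorConfig`/`DETECTOR_KEY` are deliberately omitted.

```python
from pydantic import BaseModel, Field


class DemoTensorRTConfig(BaseModel):
    # Mirrors the new field: an integer GPU index, defaulting to the first GPU.
    device: int = Field(default=0, title="GPU Device Index")


print(DemoTensorRTConfig().device)            # 0 -> first GPU by default
print(DemoTensorRTConfig(device=1).device)    # 1 -> second GPU
print(DemoTensorRTConfig(device="1").device)  # 1 -> numeric strings are coerced to int
```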
@@ -90,9 +90,8 @@ class TensorRtDetector(DetectionApi):
             e,
         )
 
-        self.runtime = trt.Runtime(self.trt_logger)
-        with open(model_path, "rb") as f:
-            return self.runtime.deserialize_cuda_engine(f.read())
+        with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
+            return runtime.deserialize_cuda_engine(f.read())
 
     def _get_input_shape(self):
         """Get input shape of the TensorRT YOLO engine."""
@@ -120,7 +119,6 @@ class TensorRtDetector(DetectionApi):
         outputs = []
         bindings = []
         output_idx = 0
-        err, stream = cuda.cuStreamCreate(0)
         for binding in self.engine:
             binding_dims = self.engine.get_binding_shape(binding)
             if len(binding_dims) == 4:
@@ -159,7 +157,7 @@ class TensorRtDetector(DetectionApi):
             output_idx += 1
         assert len(inputs) == 1, f"inputs len was {len(inputs)}"
         assert len(outputs) == 1, f"output len was {len(outputs)}"
-        return inputs, outputs, bindings, stream
+        return inputs, outputs, bindings
 
     def _do_inference(self):
         """do_inference (for TensorRT 7.0+)
@@ -167,15 +165,33 @@ class TensorRtDetector(DetectionApi):
         dimension networks.
         Inputs and outputs are expected to be lists of HostDeviceMem objects.
         """
+        # Push CUDA Context
+        cuda.cuCtxPushCurrent(self.cu_ctx)
+
         # Transfer input data to the GPU.
-        [cuda.cuMemcpyHtoD(inp.device, inp.host, inp.nbytes) for inp in self.inputs]
+        [
+            cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, self.stream)
+            for inp in self.inputs
+        ]
+
         # Run inference.
-        if not self.context.execute_v2(bindings=self.bindings):
+        if not self.context.execute_async_v2(
+            bindings=self.bindings, stream_handle=self.stream
+        ):
             logger.warn(f"Execute returned false")
+
         # Transfer predictions back from the GPU.
-        [cuda.cuMemcpyDtoH(out.host, out.device, out.nbytes) for out in self.outputs]
+        [
+            cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, self.stream)
+            for out in self.outputs
+        ]
+
         # Synchronize the stream
-        # cuda.cuStreamSynchronize(self.stream)
+        cuda.cuStreamSynchronize(self.stream)
+
+        # Pop CUDA Context
+        cuda.cuCtxPopCurrent()
+
         # Return only the host outputs.
         return [
             np.array(
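Condensed, the asynchronous flow introduced above looks like the following sketch. It is a standalone illustration with a hypothetical helper name, assuming `inputs`/`outputs` are `HostDeviceMem`-style objects and that the bindings and stream were created as in this commit. The key point is that the copies and the execution are only queued on the stream, and `cuStreamSynchronize` is the single blocking call that makes the host output buffers safe to read.

```python
from cuda import cuda  # NVIDIA cuda-python driver-API bindings


def infer_async(context, bindings, inputs, outputs, stream):
    """Hypothetical helper mirroring the pattern used in _do_inference."""
    # Queue host-to-device copies of the input buffers (non-blocking).
    for inp in inputs:
        cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, stream)

    # Queue TensorRT execution on the same stream.
    context.execute_async_v2(bindings=bindings, stream_handle=stream)

    # Queue device-to-host copies of the result buffers.
    for out in outputs:
        cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, stream)

    # Block until everything queued on the stream has finished;
    # only then do the host buffers hold valid results.
    cuda.cuStreamSynchronize(stream)
    return [out.host for out in outputs]
```

In the detector itself, this sequence runs between `cuda.cuCtxPushCurrent(self.cu_ctx)` and `cuda.cuCtxPopCurrent()`, so the queued work is issued against the CUDA context created for the configured device.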
@@ -193,9 +209,18 @@ class TensorRtDetector(DetectionApi):
         assert (
             cuda_err == cuda.CUresult.CUDA_SUCCESS
         ), f"Failed to initialize cuda {cuda_err}"
-        err, self.cu_ctx = cuda.cuCtxCreate(cuda.CUctx_flags.CU_CTX_MAP_HOST, 0)
+        err, dev_count = cuda.cuDeviceGetCount()
+        logger.debug(f"Num Available Devices: {dev_count}")
+        assert (
+            detector_config.device < dev_count
+        ), f"Invalid TensorRT Device Config. Device {detector_config.device} Invalid."
+        err, self.cu_ctx = cuda.cuCtxCreate(
+            cuda.CUctx_flags.CU_CTX_MAP_HOST, detector_config.device
+        )
+
         self.conf_th = 0.4 ##TODO: model config parameter
         self.nms_threshold = 0.4
+        err, self.stream = cuda.cuStreamCreate(0)
         self.trt_logger = TrtLogger()
         self.engine = self._load_engine(detector_config.model.path)
         self.input_shape = self._get_input_shape()
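In isolation, the device-selection portion of the constructor boils down to the sketch below: validate the configured index against the number of visible devices, then create the CUDA context on that index. The function name is hypothetical and error handling is trimmed; the calls themselves match the ones added in this commit.

```python
from cuda import cuda


def create_context_for_device(device_index: int):
    # Initialize the driver API before any other call.
    (err,) = cuda.cuInit(0)
    assert err == cuda.CUresult.CUDA_SUCCESS, f"Failed to initialize cuda {err}"

    # Reject indices outside the range of visible GPUs.
    err, dev_count = cuda.cuDeviceGetCount()
    assert device_index < dev_count, f"Device {device_index} Invalid."

    # Create the context on the selected device; later stream, memcpy and
    # execution calls are issued against this context.
    err, ctx = cuda.cuCtxCreate(cuda.CUctx_flags.CU_CTX_MAP_HOST, device_index)
    return ctx
```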
@@ -206,7 +231,6 @@ class TensorRtDetector(DetectionApi):
                 self.inputs,
                 self.outputs,
                 self.bindings,
-                self.stream,
             ) = self._allocate_buffers()
         except Exception as e:
             logger.error(e)
@@ -217,12 +241,14 @@ class TensorRtDetector(DetectionApi):
 
     def __del__(self):
         """Free CUDA memories."""
-        del self.outputs
-        del self.inputs
-        cuda.cuStreamDestroy(self.stream)
-        del self.stream
+        if self.outputs is not None:
+            del self.outputs
+        if self.inputs is not None:
+            del self.inputs
+        if self.stream is not None:
+            cuda.cuStreamDestroy(self.stream)
+            del self.stream
         del self.engine
-        del self.runtime
         del self.context
         del self.trt_logger
         cuda.cuCtxDestroy(self.cu_ctx)
@@ -257,7 +283,7 @@ class TensorRtDetector(DetectionApi):
         # normalize
         if self.input_shape[-1] != trt.int8:
             tensor_input = tensor_input.astype(self.input_shape[-1])
             tensor_input /= 255.0
 
         self.inputs[0].host = np.ascontiguousarray(
             tensor_input.astype(self.input_shape[-1])