Updates to detector for cuda python library

This commit is contained in:
Nate Meyer 2022-12-12 01:40:42 -05:00
parent 4b562add75
commit ccc1218cd5
5 changed files with 67 additions and 34 deletions

View File

@@ -2,10 +2,10 @@
 set -euxo pipefail
-echo "${CUDA_LIB_VERSION:=11.8}"
-echo "${CUDA_PKG_VERSION:=11-8}"
+echo "${CUDA_LIB_VERSION:=11.7}"
+echo "${CUDA_PKG_VERSION:=11-7}"
 echo "${CUDNN_VERSION:=8.6.0.84}"
-echo "${TENSORRT_VERSION:=8.5.1}"
+echo "${TENSORRT_VERSION:=8.4.1}"
 
 # Add NVidia Repo
 apt-get -qq update && apt-get install -y --no-install-recommends software-properties-common
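
The pins above move the stack from CUDA 11.8 / TensorRT 8.5.1 to CUDA 11.7 / TensorRT 8.4.1, matching the Python wheels pinned later in this commit. A minimal sketch (not part of the commit) for sanity-checking that the runtime inside the container matches these pins, using cuda-python's runtime bindings:

import tensorrt as trt
from cuda import cudart

# Both values should line up with the versions pinned in the install script.
err, ver = cudart.cudaRuntimeGetVersion()
assert err == cudart.cudaError_t.cudaSuccess, f"cudaRuntimeGetVersion: {err}"
print(f"CUDA runtime: {ver // 1000}.{(ver % 1000) // 10}")  # expect 11.7
print(f"TensorRT: {trt.__version__}")  # expect 8.4.1.x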

View File

@@ -5,7 +5,7 @@ import logging
 import ctypes
 import numpy as np
 import tensorrt as trt
-import cuda as cuda
+from cuda import cuda as cuda
 
 # import pycuda.driver as cuda
 # from .object_detector import ObjectDetector
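
The `cuda` module here is the low-level driver binding from the cuda-python package, replacing the pycuda calls that remain commented out. Its calling convention differs from pycuda: every `cuda.cu*` function returns a tuple whose first element is a `CUresult` status code. A minimal sketch of that convention (not from this commit; the driver API also needs a one-time `cuInit` before any other call):

from cuda import cuda

(err,) = cuda.cuInit(0)  # one-time driver initialization
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuInit returned {err}"

err, device = cuda.cuDeviceGet(0)  # results follow the status code
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuDeviceGet returned {err}"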
@@ -28,9 +28,10 @@ logger = logging.getLogger(__name__)
 class HostDeviceMem(object):
     """Simple helper data class that's a little nicer to use than a 2-tuple."""
 
-    def __init__(self, host_mem, device_mem):
+    def __init__(self, host_mem, device_mem, nbytes):
         self.host = host_mem
         self.device = device_mem
+        self.nbytes = nbytes
 
     def __str__(self):
         return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
@@ -38,10 +39,29 @@ class HostDeviceMem(object):
     def __repr__(self):
         return self.__str__()
 
+    def __del__(self):
+        cuda.cuMemFreeHost(self.host)
+        cuda.cuMemFree(self.device)
+
 
 class TensorRtDetector(DetectionApi):
     # class LocalObjectDetector(ObjectDetector):
     def _load_engine(self, model_path):
+        try:
+            ctypes.cdll.LoadLibrary(
+                "/usr/local/lib/python3.9/dist-packages/nvidia/cuda_runtime/lib/libcudart.so.11.0"
+            )
+            ctypes.cdll.LoadLibrary(
+                "/usr/local/lib/python3.9/dist-packages/tensorrt/libnvinfer.so.8"
+            )
+            ctypes.cdll.LoadLibrary(
+                "/media/frigate/models/tensorrt_demos/yolo/libyolo_layer.so"
+            )
+        except OSError as e:
+            logger.error(
+                "ERROR: failed to load libraries. %s",
+                e,
+            )
         with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
             return runtime.deserialize_cuda_engine(f.read())
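
Two things change here: buffers now free themselves when garbage-collected, and `_load_engine` preloads `libcudart`, `libnvinfer`, and the TensorRT-demos YOLO plugin with ctypes, since the pip-installed NVIDIA wheels do not live on the default linker path. A stripped-down sketch of the ownership pattern the new `__del__` implements (`PinnedBuffer` is a hypothetical name; an initialized driver and a current context are assumed):

from cuda import cuda

class PinnedBuffer:
    """Hypothetical helper owning one pinned host and one device allocation."""

    def __init__(self, nbytes):
        err, self.host = cuda.cuMemAllocHost(nbytes)
        assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
        err, self.device = cuda.cuMemAlloc(nbytes)
        assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
        self.nbytes = nbytes

    def __del__(self):
        # Same pattern as HostDeviceMem.__del__ above: release both
        # allocations when the Python object is collected.
        cuda.cuMemFreeHost(self.host)
        cuda.cuMemFree(self.device)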
@@ -65,7 +85,7 @@ class TensorRtDetector(DetectionApi):
         outputs = []
         bindings = []
         output_idx = 0
-        stream = cuda.cuStream()
+        err, stream = cuda.cuStreamCreate(0)
         for binding in self.engine:
             binding_dims = self.engine.get_binding_shape(binding)
             if len(binding_dims) == 4:
@@ -78,23 +98,28 @@ class TensorRtDetector(DetectionApi):
                 raise ValueError(
                     "bad dims of binding %s: %s" % (binding, str(binding_dims))
                 )
-            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
+            nbytes = (
+                size
+                * np.dtype(trt.nptype(self.engine.get_binding_dtype(binding))).itemsize
+            )
             # Allocate host and device buffers
-            host_mem = cuda.pagelocked_empty(size, dtype)
-            device_mem = cuda.mem_alloc(host_mem.nbytes)
+            err, host_mem = cuda.cuMemAllocHost(nbytes)
+            assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
+            err, device_mem = cuda.cuMemAlloc(nbytes)
+            assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
             # Append the device buffer to device bindings.
             bindings.append(int(device_mem))
             # Append to the appropriate list.
             if self.engine.binding_is_input(binding):
-                inputs.append(HostDeviceMem(host_mem, device_mem))
+                inputs.append(HostDeviceMem(host_mem, device_mem, nbytes))
             else:
                 # each grid has 3 anchors, each anchor generates a detection
                 # output of 7 float32 values
-                assert size % 7 == 0
-                outputs.append(HostDeviceMem(host_mem, device_mem))
+                assert size % 7 == 0, f"output size was {size}"
+                outputs.append(HostDeviceMem(host_mem, device_mem, nbytes))
                 output_idx += 1
-        assert len(inputs) == 1
-        assert len(outputs) == 1
+        assert len(inputs) == 1, f"inputs len was {len(inputs)}"
+        assert len(outputs) == 1, f"output len was {len(outputs)}"
         return inputs, outputs, bindings, stream
 
     def _do_inference(self):
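
Note that `cuMemAllocHost` returns a raw pointer rather than a numpy array, unlike pycuda's `pagelocked_empty`, so the byte count must now be computed by hand (hence `nbytes`) and the pinned buffer has to be wrapped before numpy can read it. A hedged sketch of one way to build such a view (the `size` value and the wrapping approach are illustrative, not taken from this commit):

import ctypes
import numpy as np
from cuda import cuda

size = 7 * 1001  # hypothetical element count for a float32 output binding
nbytes = size * np.dtype(np.float32).itemsize
err, host_ptr = cuda.cuMemAllocHost(nbytes)
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"

# View the pinned allocation as a float32 array; no copy is made.
host_view = np.ctypeslib.as_array(
    (ctypes.c_float * size).from_address(int(host_ptr))
)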
@@ -105,16 +130,16 @@ class TensorRtDetector(DetectionApi):
         """
         # Transfer input data to the GPU.
         [
-            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
+            cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, self.stream)
             for inp in self.inputs
         ]
         # Run inference.
         self.context.execute_async_v2(
-            bindings=self.bindings, stream_handle=self.stream.handle
+            bindings=self.bindings, stream_handle=self.stream.getPtr()
         )
         # Transfer predictions back from the GPU.
         [
-            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
+            cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, self.stream)
             for out in self.outputs
         ]
         # Synchronize the stream
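
The copies and the inference all land on the one stream created in `_allocate_buffers`; because cuda-python wraps the handle in a `CUstream` object, `getPtr()` is used to hand the raw handle to TensorRT. A self-contained sketch of the same asynchronous round-trip (names are illustrative; assumes `cuInit` and a current context):

import numpy as np
from cuda import cuda

host = np.arange(16, dtype=np.float32)  # hypothetical host array
err, device = cuda.cuMemAlloc(host.nbytes)
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
err, stream = cuda.cuStreamCreate(0)
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuStreamCreate returned {err}"

(err,) = cuda.cuMemcpyHtoDAsync(device, host.ctypes.data, host.nbytes, stream)
# ... kernel or TensorRT work would be enqueued here via stream.getPtr() ...
(err,) = cuda.cuMemcpyDtoHAsync(host.ctypes.data, device, host.nbytes, stream)
(err,) = cuda.cuStreamSynchronize(stream)  # block until both copies finish
(err,) = cuda.cuStreamDestroy(stream)
(err,) = cuda.cuMemFree(device)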
@@ -150,6 +175,7 @@ class TensorRtDetector(DetectionApi):
         """Free CUDA memories."""
         del self.outputs
         del self.inputs
+        cuda.cuStreamDestroy(self.stream)
         del self.stream
 
     def _postprocess_yolo(self, trt_outputs, img_w, img_h, conf_th, nms_threshold):
@@ -183,10 +209,10 @@ class TensorRtDetector(DetectionApi):
         # 2..5 - a value between 0 and 1 of the box: [top, left, bottom, right]
 
         # transform [height, width, 3] into (3, H, W)
-        tensor_input = tensor_input.transpose((2, 0, 1)).astype(np.float32)
+        # tensor_input = tensor_input.transpose((2, 0, 1)).astype(np.float32)
         # normalize
-        tensor_input /= 255.0
+        # tensor_input /= 255.0
 
         self.inputs[0].host = np.ascontiguousarray(tensor_input)
         trt_outputs = self._do_inference()

View File

@@ -1,9 +1,10 @@
-cuda-python == 11.8.*
-tensorrt == 8.5.*
-nvidia-cuda-runtime-cu11 == 11.8.*
-nvidia-cublas-cu11 == 11.11.*
-nvidia-cudnn-cu11 == 8.7.*
-pyindex-nvidia
-polygraphy-trtexec
-# tensorflow
-# easydict
+nvidia-pyindex; platform_machine == 'x86_64'
+nvidia-tensorrt == 8.4.1.5; platform_machine == 'x86_64'
+cuda-python == 11.7; platform_machine == 'x86_64'
+cython == 0.29.*; platform_machine == 'x86_64'
+nvidia-cuda-runtime-cu11 == 11.7.*; platform_machine == 'x86_64'
+nvidia-cublas-cu11 == 11.10.*; platform_machine == 'x86_64'
+nvidia-cudnn-cu11 == 8.4.*; platform_machine == 'x86_64'
+polygraphy
+tensorflow
+easydict

View File

@@ -24,9 +24,14 @@ zeroconf == 0.39.4
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-manylinux_2_31_x86_64.whl; platform_machine == 'x86_64'
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-linux_aarch64.whl; platform_machine == 'aarch64'
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-linux_armv7l.whl; platform_machine == 'armv7l'
-# NVidia TensorRT Support
-cuda-python == 11.8.*; platform_machine == 'x86_64'
-tensorrt == 8.5.*; platform_machine == 'x86_64'
-nvidia-cuda-runtime-cu11 == 11.8.*; platform_machine == 'x86_64'
-nvidia-cublas-cu11 == 11.11.*; platform_machine == 'x86_64'
-nvidia-cudnn-cu11 == 8.7.*; platform_machine == 'x86_64'
+# NVidia TensorRT Support (amd64 only)
+nvidia-pyindex; platform_machine == 'x86_64'
+nvidia-tensorrt == 8.4.1.5; platform_machine == 'x86_64'
+cuda-python == 11.7; platform_machine == 'x86_64'
+cython == 0.29.*; platform_machine == 'x86_64'
+nvidia-cuda-runtime-cu11 == 2022.4.25; platform_machine == 'x86_64'
+nvidia-cuda-runtime-cu117 == 11.7.*; platform_machine == 'x86_64'
+nvidia-cublas-cu11 == 2022.4.8; platform_machine == 'x86_64'
+nvidia-cublas-cu117 == 11.10.*; platform_machine == 'x86_64'
+nvidia-cudnn-cu11 == 2022.5.19; platform_machine == 'x86_64'
+nvidia-cudnn-cu116 == 8.4.1*; platform_machine == 'x86_64'

View File

@@ -1 +1,2 @@
 scikit-build == 0.14.1
+nvidia-pyindex