diff --git a/docker/install_tensorrt.sh b/docker/install_tensorrt.sh
index ff84de118..faeb00a83 100755
--- a/docker/install_tensorrt.sh
+++ b/docker/install_tensorrt.sh
@@ -2,10 +2,10 @@
 
 set -euxo pipefail
 
-echo "${CUDA_LIB_VERSION:=11.8}"
-echo "${CUDA_PKG_VERSION:=11-8}"
+echo "${CUDA_LIB_VERSION:=11.7}"
+echo "${CUDA_PKG_VERSION:=11-7}"
 echo "${CUDNN_VERSION:=8.6.0.84}"
-echo "${TENSORRT_VERSION:=8.5.1}"
+echo "${TENSORRT_VERSION:=8.4.1}"
 
 # Add NVidia Repo
 apt-get -qq update && apt-get install -y --no-install-recommends software-properties-common
diff --git a/frigate/detectors/tensorrt.py b/frigate/detectors/tensorrt.py
index 32289fbff..dfbcdeaf1 100644
--- a/frigate/detectors/tensorrt.py
+++ b/frigate/detectors/tensorrt.py
@@ -5,7 +5,7 @@ import logging
 import ctypes
 import numpy as np
 import tensorrt as trt
-import cuda as cuda
+from cuda import cuda as cuda
 
 # import pycuda.driver as cuda
 # from .object_detector import ObjectDetector
@@ -28,9 +28,10 @@ logger = logging.getLogger(__name__)
 class HostDeviceMem(object):
     """Simple helper data class that's a little nicer to use than a 2-tuple."""
 
-    def __init__(self, host_mem, device_mem):
+    def __init__(self, host_mem, device_mem, nbytes):
         self.host = host_mem
         self.device = device_mem
+        self.nbytes = nbytes
 
     def __str__(self):
         return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
@@ -38,10 +39,29 @@
     def __repr__(self):
         return self.__str__()
 
+    def __del__(self):
+        cuda.cuMemFreeHost(self.host)
+        cuda.cuMemFree(self.device)
+
 
 class TensorRtDetector(DetectionApi):
     # class LocalObjectDetector(ObjectDetector):
     def _load_engine(self, model_path):
+        try:
+            ctypes.cdll.LoadLibrary(
+                "/usr/local/lib/python3.9/dist-packages/nvidia/cuda_runtime/lib/libcudart.so.11.0"
+            )
+            ctypes.cdll.LoadLibrary(
+                "/usr/local/lib/python3.9/dist-packages/tensorrt/libnvinfer.so.8"
+            )
+            ctypes.cdll.LoadLibrary(
+                "/media/frigate/models/tensorrt_demos/yolo/libyolo_layer.so"
+            )
+        except OSError as e:
+            logger.error(
+                "ERROR: failed to load libraries. %s",
+                e,
+            )
         with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
             return runtime.deserialize_cuda_engine(f.read())
 
@@ -65,7 +85,7 @@
         outputs = []
         bindings = []
         output_idx = 0
-        stream = cuda.cuStream()
+        err, stream = cuda.cuStreamCreate(0)
         for binding in self.engine:
             binding_dims = self.engine.get_binding_shape(binding)
             if len(binding_dims) == 4:
@@ -78,23 +98,28 @@
                 raise ValueError(
                     "bad dims of binding %s: %s" % (binding, str(binding_dims))
                 )
-            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
+            nbytes = (
+                size
+                * np.dtype(trt.nptype(self.engine.get_binding_dtype(binding))).itemsize
+            )
             # Allocate host and device buffers
-            host_mem = cuda.pagelocked_empty(size, dtype)
-            device_mem = cuda.mem_alloc(host_mem.nbytes)
+            err, host_mem = cuda.cuMemAllocHost(nbytes)
+            assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
+            err, device_mem = cuda.cuMemAlloc(nbytes)
+            assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
             # Append the device buffer to device bindings.
             bindings.append(int(device_mem))
             # Append to the appropriate list.
             if self.engine.binding_is_input(binding):
-                inputs.append(HostDeviceMem(host_mem, device_mem))
+                inputs.append(HostDeviceMem(host_mem, device_mem, nbytes))
             else:
                 # each grid has 3 anchors, each anchor generates a detection
                 # output of 7 float32 values
-                assert size % 7 == 0
-                outputs.append(HostDeviceMem(host_mem, device_mem))
+                assert size % 7 == 0, f"output size was {size}"
+                outputs.append(HostDeviceMem(host_mem, device_mem, nbytes))
             output_idx += 1
-        assert len(inputs) == 1
-        assert len(outputs) == 1
+        assert len(inputs) == 1, f"inputs len was {len(inputs)}"
+        assert len(outputs) == 1, f"output len was {len(outputs)}"
         return inputs, outputs, bindings, stream
 
     def _do_inference(self):
@@ -105,16 +130,16 @@
         """
         # Transfer input data to the GPU.
         [
-            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
+            cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, self.stream)
             for inp in self.inputs
         ]
         # Run inference.
         self.context.execute_async_v2(
-            bindings=self.bindings, stream_handle=self.stream.handle
+            bindings=self.bindings, stream_handle=self.stream.getPtr()
         )
         # Transfer predictions back from the GPU.
         [
-            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
+            cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, self.stream)
             for out in self.outputs
         ]
         # Synchronize the stream
@@ -150,6 +175,7 @@
         """Free CUDA memories."""
         del self.outputs
         del self.inputs
+        cuda.cuStreamDestroy(self.stream)
         del self.stream
 
     def _postprocess_yolo(self, trt_outputs, img_w, img_h, conf_th, nms_threshold):
@@ -183,10 +209,10 @@
         # 2..5 - a value between 0 and 1 of the box: [top, left, bottom, right]
 
         # transform [height, width, 3] into (3, H, W)
-        tensor_input = tensor_input.transpose((2, 0, 1)).astype(np.float32)
+        # tensor_input = tensor_input.transpose((2, 0, 1)).astype(np.float32)
 
         # normalize
-        tensor_input /= 255.0
+        # tensor_input /= 255.0
 
         self.inputs[0].host = np.ascontiguousarray(tensor_input)
         trt_outputs = self._do_inference()
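Note on the cuda-python migration above: unlike pycuda, every `cuda.cu*` driver call returns a `(CUresult, ...)` tuple instead of raising on failure, so each call site has to unpack and check the status itself. The sketch below shows the same allocate/copy/synchronize life cycle `_do_inference` now relies on; it is illustrative only and not part of the patch (the element count `N` and buffer names are made up, and the explicit `cuInit`/`cuCtxCreate` may be redundant once TensorRT has already initialized CUDA on the thread):

```python
# Minimal round trip through the cuda-python driver API (hedged sketch).
import numpy as np
from cuda import cuda

(err,) = cuda.cuInit(0)
err, dev = cuda.cuDeviceGet(0)
err, ctx = cuda.cuCtxCreate(0, dev)
err, stream = cuda.cuStreamCreate(0)

N = 16  # arbitrary element count for the example
host_in = np.arange(N, dtype=np.float32)
host_out = np.empty_like(host_in)
err, dptr = cuda.cuMemAlloc(host_in.nbytes)

# Host pointers can be passed as raw integers; byte counts are explicit.
cuda.cuMemcpyHtoDAsync(dptr, host_in.ctypes.data, host_in.nbytes, stream)
cuda.cuMemcpyDtoHAsync(host_out.ctypes.data, dptr, host_out.nbytes, stream)
cuda.cuStreamSynchronize(stream)
assert (host_out == host_in).all()

# Teardown; the detector does the equivalent in __del__ and its cleanup path.
cuda.cuMemFree(dptr)
cuda.cuStreamDestroy(stream)
cuda.cuCtxDestroy(ctx)
```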
diff --git a/requirements-tensorrt.txt b/requirements-tensorrt.txt
index 134960075..7acf0222b 100644
--- a/requirements-tensorrt.txt
+++ b/requirements-tensorrt.txt
@@ -1,9 +1,10 @@
-cuda-python == 11.8.*
-tensorrt == 8.5.*
-nvidia-cuda-runtime-cu11 == 11.8.*
-nvidia-cublas-cu11 == 11.11.*
-nvidia-cudnn-cu11 == 8.7.*
-pyindex-nvidia
-polygraphy-trtexec
-# tensorflow
-# easydict
\ No newline at end of file
+nvidia-pyindex; platform_machine == 'x86_64'
+nvidia-tensorrt == 8.4.1.5; platform_machine == 'x86_64'
+cuda-python == 11.7; platform_machine == 'x86_64'
+cython == 0.29.*; platform_machine == 'x86_64'
+nvidia-cuda-runtime-cu11 == 11.7.*; platform_machine == 'x86_64'
+nvidia-cublas-cu11 == 11.10.*; platform_machine == 'x86_64'
+nvidia-cudnn-cu11 == 8.4.*; platform_machine == 'x86_64'
+polygraphy
+tensorflow
+easydict
\ No newline at end of file
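The requirements-tensorrt.txt rewrite pins nvidia-tensorrt 8.4.1.5 against CUDA 11.7, matching the 8.4.1/11.7 versions chosen in install_tensorrt.sh above; nvidia-pyindex is listed first because pip can only resolve the `nvidia-*` wheels once NVIDIA's package index is installed, which in practice usually means installing it in a separate, earlier pip step. A hedged post-install sanity check (illustrative only, not part of the patch):

```python
# Verify the environment picked up the intended TensorRT/CUDA pairing.
import tensorrt as trt
from cuda import cuda

assert trt.__version__.startswith("8.4.1"), trt.__version__

# No cuInit is needed for this query; 11070 encodes CUDA 11.7 support.
err, driver_version = cuda.cuDriverGetVersion()
assert err == cuda.CUresult.CUDA_SUCCESS
print(trt.__version__, driver_version)
```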
diff --git a/requirements-wheels.txt b/requirements-wheels.txt
index 395d2a5e6..daeacfe48 100644
--- a/requirements-wheels.txt
+++ b/requirements-wheels.txt
@@ -24,9 +24,14 @@ zeroconf == 0.39.4
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-manylinux_2_31_x86_64.whl; platform_machine == 'x86_64'
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-linux_aarch64.whl; platform_machine == 'aarch64'
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-linux_armv7l.whl; platform_machine == 'armv7l'
-# NVidia TensorRT Support
-cuda-python == 11.8.*; platform_machine == 'x86_64'
-tensorrt == 8.5.*; platform_machine == 'x86_64'
-nvidia-cuda-runtime-cu11 == 11.8.*; platform_machine == 'x86_64'
-nvidia-cublas-cu11 == 11.11.*; platform_machine == 'x86_64'
-nvidia-cudnn-cu11 == 8.7.*; platform_machine == 'x86_64'
\ No newline at end of file
+# NVidia TensorRT Support (amd64 only)
+nvidia-pyindex; platform_machine == 'x86_64'
+nvidia-tensorrt == 8.4.1.5; platform_machine == 'x86_64'
+cuda-python == 11.7; platform_machine == 'x86_64'
+cython == 0.29.*; platform_machine == 'x86_64'
+nvidia-cuda-runtime-cu11 == 2022.4.25; platform_machine == 'x86_64'
+nvidia-cuda-runtime-cu117 == 11.7.*; platform_machine == 'x86_64'
+nvidia-cublas-cu11 == 2022.4.8; platform_machine == 'x86_64'
+nvidia-cublas-cu117 == 11.10.*; platform_machine == 'x86_64'
+nvidia-cudnn-cu11 == 2022.5.19; platform_machine == 'x86_64'
+nvidia-cudnn-cu116 == 8.4.1.*; platform_machine == 'x86_64'
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d3679d115..546b16fb3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
 scikit-build == 0.14.1
+nvidia-pyindex
\ No newline at end of file
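A recurring pattern in the detector changes is `err, value = cuda.cuSomething(...)` followed by a bare assert, while other calls (the async memcpys, the frees in `__del__`, `cuStreamDestroy`) discard the returned status entirely. If silent failures there ever need debugging, a small wrapper along these lines (hypothetical, not part of the patch) would centralize the checking:

```python
from cuda import cuda

def cu_check(result):
    """Unpack a cuda-python (CUresult, ...) return, raising on any error."""
    err, *values = result
    if err != cuda.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"CUDA driver call failed: {err}")
    return values[0] if len(values) == 1 else tuple(values)

# Usage mirroring the calls in this patch:
#   stream = cu_check(cuda.cuStreamCreate(0))
#   cu_check(cuda.cuMemcpyHtoDAsync(dst, src, nbytes, stream))
```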