Updates to detector for cuda python library

2026-02-03 09:45:22 +03:00 · 2022-12-12 01:40:42 -05:00 · 2022-12-12 01:40:42 -05:00 · ccc1218cd5
commit ccc1218cd5
parent 4b562add75
5 changed files with 67 additions and 34 deletions
--- a/docker/install_tensorrt.sh
+++ b/docker/install_tensorrt.sh
@ -2,10 +2,10 @@

 set -euxo pipefail

-echo "${CUDA_LIB_VERSION:=11.8}"
-echo "${CUDA_PKG_VERSION:=11-8}"
+echo "${CUDA_LIB_VERSION:=11.7}"
+echo "${CUDA_PKG_VERSION:=11-7}"
 echo "${CUDNN_VERSION:=8.6.0.84}"
-echo "${TENSORRT_VERSION:=8.5.1}"
+echo "${TENSORRT_VERSION:=8.4.1}"

 # Add NVidia Repo
 apt-get -qq update && apt-get install -y --no-install-recommends software-properties-common
--- a/frigate/detectors/tensorrt.py
+++ b/frigate/detectors/tensorrt.py
@ -5,7 +5,7 @@ import logging
 import ctypes
 import numpy as np
 import tensorrt as trt
-import cuda as cuda
+from cuda import cuda as cuda

 # import pycuda.driver as cuda
 # from .object_detector import ObjectDetector
@ -28,9 +28,10 @@ logger = logging.getLogger(__name__)
 class HostDeviceMem(object):
    """Simple helper data class that's a little nicer to use than a 2-tuple."""

-    def __init__(self, host_mem, device_mem):
+    def __init__(self, host_mem, device_mem, nbytes):
        self.host = host_mem
        self.device = device_mem
+        self.nbytes = nbytes

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
@ -38,10 +39,29 @@ class HostDeviceMem(object):
    def __repr__(self):
        return self.__str__()

+    def __del__(self):
+        cuda.cuMemFreeHost(self.host)
+        cuda.cuMemFree(self.device)
+

 class TensorRtDetector(DetectionApi):
    # class LocalObjectDetector(ObjectDetector):
    def _load_engine(self, model_path):
+        try:
+            ctypes.cdll.LoadLibrary(
+                "/usr/local/lib/python3.9/dist-packages/nvidia/cuda_runtime/lib/libcudart.so.11.0"
+            )
+            ctypes.cdll.LoadLibrary(
+                "/usr/local/lib/python3.9/dist-packages/tensorrt/libnvinfer.so.8"
+            )
+            ctypes.cdll.LoadLibrary(
+                "/media/frigate/models/tensorrt_demos/yolo/libyolo_layer.so"
+            )
+        except OSError as e:
+            logger.error(
+                "ERROR: failed to load libraries. %s",
+                e,
+            )
        with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

@ -65,7 +85,7 @@ class TensorRtDetector(DetectionApi):
        outputs = []
        bindings = []
        output_idx = 0
-        stream = cuda.cuStream()
+        err, stream = cuda.cuStreamCreate(0)
        for binding in self.engine:
            binding_dims = self.engine.get_binding_shape(binding)
            if len(binding_dims) == 4:
@ -78,23 +98,28 @@ class TensorRtDetector(DetectionApi):
                raise ValueError(
                    "bad dims of binding %s: %s" % (binding, str(binding_dims))
                )
-            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
+            nbytes = (
+                size
+                * np.dtype(trt.nptype(self.engine.get_binding_dtype(binding))).itemsize
+            )
            # Allocate host and device buffers
-            host_mem = cuda.pagelocked_empty(size, dtype)
-            device_mem = cuda.mem_alloc(host_mem.nbytes)
+            err, host_mem = cuda.cuMemAllocHost(nbytes)
+            assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
+            err, device_mem = cuda.cuMemAlloc(nbytes)
+            assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if self.engine.binding_is_input(binding):
-                inputs.append(HostDeviceMem(host_mem, device_mem))
+                inputs.append(HostDeviceMem(host_mem, device_mem, nbytes))
            else:
                # each grid has 3 anchors, each anchor generates a detection
                # output of 7 float32 values
-                assert size % 7 == 0
-                outputs.append(HostDeviceMem(host_mem, device_mem))
+                assert size % 7 == 0, f"output size was {size}"
+                outputs.append(HostDeviceMem(host_mem, device_mem, nbytes))
                output_idx += 1
-        assert len(inputs) == 1
-        assert len(outputs) == 1
+        assert len(inputs) == 1, f"inputs len was {len(inputs)}"
+        assert len(outputs) == 1, f"output len was {len(outputs)}"
        return inputs, outputs, bindings, stream

    def _do_inference(self):
@ -105,16 +130,16 @@ class TensorRtDetector(DetectionApi):
        """
        # Transfer input data to the GPU.
        [
-            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
+            cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, self.stream)
            for inp in self.inputs
        ]
        # Run inference.
        self.context.execute_async_v2(
-            bindings=self.bindings, stream_handle=self.stream.handle
+            bindings=self.bindings, stream_handle=self.stream.getPtr()
        )
        # Transfer predictions back from the GPU.
        [
-            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
+            cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, self.stream)
            for out in self.outputs
        ]
        # Synchronize the stream
@ -150,6 +175,7 @@ class TensorRtDetector(DetectionApi):
        """Free CUDA memories."""
        del self.outputs
        del self.inputs
+        cuda.cuStreamDestroy(self.stream)
        del self.stream

    def _postprocess_yolo(self, trt_outputs, img_w, img_h, conf_th, nms_threshold):
@ -183,10 +209,10 @@ class TensorRtDetector(DetectionApi):
        # 2..5 - a value between 0 and 1 of the box: [top, left, bottom, right]

        # transform [height, width, 3] into (3, H, W)
-        tensor_input = tensor_input.transpose((2, 0, 1)).astype(np.float32)
+        # tensor_input = tensor_input.transpose((2, 0, 1)).astype(np.float32)

        # normalize
-        tensor_input /= 255.0
+        # tensor_input /= 255.0

        self.inputs[0].host = np.ascontiguousarray(tensor_input)
        trt_outputs = self._do_inference()
--- a/requirements-tensorrt.txt
+++ b/requirements-tensorrt.txt
@ -1,9 +1,10 @@
-cuda-python == 11.8.*
-tensorrt == 8.5.*
-nvidia-cuda-runtime-cu11 == 11.8.*
-nvidia-cublas-cu11 == 11.11.*
-nvidia-cudnn-cu11 == 8.7.*
-pyindex-nvidia
-polygraphy-trtexec
-# tensorflow
-# easydict
+nvidia-pyindex; platform_machine == 'x86_64'
+nvidia-tensorrt == 8.4.1.5; platform_machine == 'x86_64'
+cuda-python == 11.7; platform_machine == 'x86_64'
+cython == 0.29.*; platform_machine == 'x86_64'
+nvidia-cuda-runtime-cu11 == 11.7.*; platform_machine == 'x86_64'
+nvidia-cublas-cu11 == 11.10.*; platform_machine == 'x86_64'
+nvidia-cudnn-cu11 == 8.4.*; platform_machine == 'x86_64'
+polygraphy
+tensorflow
+easydict
--- a/requirements-wheels.txt
+++ b/requirements-wheels.txt
@ -24,9 +24,14 @@ zeroconf == 0.39.4
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-manylinux_2_31_x86_64.whl; platform_machine == 'x86_64'
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-linux_aarch64.whl; platform_machine == 'aarch64'
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-linux_armv7l.whl; platform_machine == 'armv7l'
-# NVidia TensorRT Support
-cuda-python == 11.8.*; platform_machine == 'x86_64'
-tensorrt == 8.5.*; platform_machine == 'x86_64'
-nvidia-cuda-runtime-cu11 == 11.8.*; platform_machine == 'x86_64'
-nvidia-cublas-cu11 == 11.11.*; platform_machine == 'x86_64'
-nvidia-cudnn-cu11 == 8.7.*; platform_machine == 'x86_64'
+# NVidia TensorRT Support (amd64 only)
+nvidia-pyindex; platform_machine == 'x86_64'
+nvidia-tensorrt == 8.4.1.5; platform_machine == 'x86_64'
+cuda-python == 11.7; platform_machine == 'x86_64'
+cython == 0.29.*; platform_machine == 'x86_64'
+nvidia-cuda-runtime-cu11 == 2022.4.25; platform_machine == 'x86_64'
+nvidia-cuda-runtime-cu117 == 11.7.*; platform_machine == 'x86_64'
+nvidia-cublas-cu11 == 2022.4.8; platform_machine == 'x86_64'
+nvidia-cublas-cu117 == 11.10.*; platform_machine == 'x86_64'
+nvidia-cudnn-cu11 == 2022.5.19; platform_machine == 'x86_64'
+nvidia-cudnn-cu116 == 8.4.1*; platform_machine == 'x86_64'
--- a/requirements.txt
+++ b/requirements.txt
@ -1 +1,2 @@
 scikit-build == 0.14.1
+nvidia-pyindex