From 0ef7bfbbb0d57967cd96b3d8da654381bf43b279 Mon Sep 17 00:00:00 2001
From: Nate Meyer
Date: Fri, 9 Dec 2022 14:36:36 -0500
Subject: [PATCH] Add tensorRT detector

---
 Dockerfile                    |   5 +-
 docker-compose.yml            |   7 ++
 frigate/detectors/tensorrt.py | 226 ++++++++++++++++++++++++++++++++++
 requirements-tensorrt.txt     |   5 +
 requirements-wheels.txt       |   2 +
 5 files changed, 243 insertions(+), 2 deletions(-)
 create mode 100644 frigate/detectors/tensorrt.py
 create mode 100644 requirements-tensorrt.txt

diff --git a/Dockerfile b/Dockerfile
index 26c822e47..92608a133 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -140,7 +140,7 @@ RUN /bin/mkdir -p '/usr/local/lib' && \
 # /build/build_pycuda.sh
 
 # Download and Convert TensorRT Model
-FROM base_amd64 as tensorrt-converter
+# FROM base_amd64 as tensorrt-converter
 
 ## TODO
 
@@ -188,7 +188,8 @@ RUN apt-get -qq update \
     libtbb2 libtbb-dev libdc1394-22-dev libopenexr-dev \
    libgstreamer-plugins-base1.0-dev libgstreamer1.0-dev \
    # scipy dependencies
-   gcc gfortran libopenblas-dev liblapack-dev
+   gcc gfortran libopenblas-dev liblapack-dev && \
+   rm -rf /var/lib/apt/lists/*
 
 RUN wget -q https://bootstrap.pypa.io/get-pip.py -O get-pip.py \
     && python3 get-pip.py "pip"
diff --git a/docker-compose.yml b/docker-compose.yml
index 3ed08493c..bde0ca63a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -12,6 +12,13 @@ services:
     build:
       context: .
       target: devcontainer
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
     devices:
       - /dev/bus/usb:/dev/bus/usb
       # - /dev/dri:/dev/dri # for intel hwaccel, needs to be updated for your hardware
diff --git a/frigate/detectors/tensorrt.py b/frigate/detectors/tensorrt.py
new file mode 100644
index 000000000..8e4b66aa0
--- /dev/null
+++ b/frigate/detectors/tensorrt.py
@@ -0,0 +1,226 @@
+import logging
+
+# from frigate.config import DetectorConfig, DetectorTypeEnum
+# from frigate.util import EventsPerSecond
+import ctypes
+import numpy as np
+import tensorrt as trt
+import cuda as cuda
+
+# import pycuda.driver as cuda
+# from .object_detector import ObjectDetector
+# import pycuda.autoinit # This is needed for initializing CUDA driver
+from frigate.detectors.detection_api import DetectionApi
+
+logger = logging.getLogger(__name__)
+
+
+# def object_detector_factory(detector_config: DetectorConfig, model_path: str):
+#     if detector_config.type != DetectorTypeEnum.tensorrt:
+#         return None
+#     try:
+#         ctypes.cdll.LoadLibrary("/yolo4/libyolo_layer.so")
+#     except OSError as e:
+#         logger.error("ERROR: failed to load /yolo4/libyolo_layer.so.  %s", e)
%s", e) +# return LocalObjectDetector(detector_config, model_path) + + +class HostDeviceMem(object): + """Simple helper data class that's a little nicer to use than a 2-tuple.""" + + def __init__(self, host_mem, device_mem): + self.host = host_mem + self.device = device_mem + + def __str__(self): + return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) + + def __repr__(self): + return self.__str__() + + +class TensorRtDetector(DetectionApi): + # class LocalObjectDetector(ObjectDetector): + def _load_engine(self, model_path): + with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def _get_input_shape(self): + """Get input shape of the TensorRT YOLO engine.""" + binding = self.engine[0] + assert self.engine.binding_is_input(binding) + binding_dims = self.engine.get_binding_shape(binding) + if len(binding_dims) == 4: + return tuple(binding_dims[2:]) + elif len(binding_dims) == 3: + return tuple(binding_dims[1:]) + else: + raise ValueError( + "bad dims of binding %s: %s" % (binding, str(binding_dims)) + ) + + def _allocate_buffers(self): + """Allocates all host/device in/out buffers required for an engine.""" + inputs = [] + outputs = [] + bindings = [] + output_idx = 0 + stream = cuda.Stream() + for binding in self.engine: + binding_dims = self.engine.get_binding_shape(binding) + if len(binding_dims) == 4: + # explicit batch case (TensorRT 7+) + size = trt.volume(binding_dims) + elif len(binding_dims) == 3: + # implicit batch case (TensorRT 6 or older) + size = trt.volume(binding_dims) * self.engine.max_batch_size + else: + raise ValueError( + "bad dims of binding %s: %s" % (binding, str(binding_dims)) + ) + dtype = trt.nptype(self.engine.get_binding_dtype(binding)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + device_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + bindings.append(int(device_mem)) + # Append to the appropriate list. + if self.engine.binding_is_input(binding): + inputs.append(HostDeviceMem(host_mem, device_mem)) + else: + # each grid has 3 anchors, each anchor generates a detection + # output of 7 float32 values + assert size % 7 == 0 + outputs.append(HostDeviceMem(host_mem, device_mem)) + output_idx += 1 + assert len(inputs) == 1 + assert len(outputs) == 1 + return inputs, outputs, bindings, stream + + def _do_inference(self): + """do_inference (for TensorRT 7.0+) + This function is generalized for multiple inputs/outputs for full + dimension networks. + Inputs and outputs are expected to be lists of HostDeviceMem objects. + """ + # Transfer input data to the GPU. + [ + cuda.memcpy_htod_async(inp.device, inp.host, self.stream) + for inp in self.inputs + ] + # Run inference. + self.context.execute_async_v2( + bindings=self.bindings, stream_handle=self.stream.handle + ) + # Transfer predictions back from the GPU. + [ + cuda.memcpy_dtoh_async(out.host, out.device, self.stream) + for out in self.outputs + ] + # Synchronize the stream + self.stream.synchronize() + # Return only the host outputs. 
+        return [out.host for out in self.outputs]
+
+    def __init__(self, det_device=None, model_config=None, num_threads=1):
+        # def __init__(self, detector_config: DetectorConfig, model_path: str):
+        # self.fps = EventsPerSecond()
+        self.conf_th = 0.4  ##TODO: model config parameter
+        self.nms_threshold = 0.4
+        self.trt_logger = trt.Logger(trt.Logger.INFO)
+        self.engine = self._load_engine(model_config.path)
+        self.input_shape = self._get_input_shape()
+
+        try:
+            self.context = self.engine.create_execution_context()
+            (
+                self.inputs,
+                self.outputs,
+                self.bindings,
+                self.stream,
+            ) = self._allocate_buffers()
+        except Exception as e:
+            logger.error(e)
+            raise RuntimeError("failed to allocate CUDA resources") from e
+
+        logger.debug("TensorRT loaded. Input shape is %s", self.input_shape)
+        logger.debug("TensorRT version is %s", trt.__version__[0])
+
+    def __del__(self):
+        """Free CUDA memories."""
+        del self.outputs
+        del self.inputs
+        del self.stream
+
+    def _postprocess_yolo(self, trt_outputs, img_w, img_h, conf_th, nms_threshold):
+        """Postprocess TensorRT outputs.
+        # Args
+            trt_outputs: a list of 2 or 3 tensors, where each tensor
+                contains a multiple of 7 float32 numbers in
+                the order of [x, y, w, h, box_confidence, class_id, class_prob]
+            conf_th: confidence threshold
+        # Returns
+            detections: Nx7 array of filtered detections
+        """
+        # filter low-conf detections and concatenate results of all yolo layers
+        detections = []
+        for o in trt_outputs:
+            dets = o.reshape((-1, 7))
+            dets = dets[dets[:, 4] * dets[:, 6] >= conf_th]
+            detections.append(dets)
+        detections = np.concatenate(detections, axis=0)
+
+        return detections
+
+    # def detect(self, tensor_input, threshold=0.4):
+    #     pass
+
+    def detect_raw(self, tensor_input):
+        # Input tensor has the shape of [height, width, 3]
+        # Output tensor of float32 of shape [20, 6] where:
+        # 0 - class id
+        # 1 - score
+        # 2..5 - a value between 0 and 1 of the box: [top, left, bottom, right]
+
+        # transform [height, width, 3] into (3, H, W)
+        tensor_input = tensor_input.transpose((2, 0, 1)).astype(np.float32)
+
+        # normalize
+        tensor_input /= 255.0
+
+        self.inputs[0].host = np.ascontiguousarray(tensor_input)
+        trt_outputs = self._do_inference()
+
+        raw_detections = self._postprocess_yolo(
+            trt_outputs,
+            tensor_input.shape[1],
+            tensor_input.shape[0],
+            self.conf_th,
+            nms_threshold=self.nms_threshold,
+        )
+
+        if len(raw_detections) == 0:
+            return np.zeros((20, 6), np.float32)
+
+        # raw_detections: Nx7 numpy array of
+        # [x, y, w, h, box_confidence, class_id, class_prob] rows
+
+        # Calculate score as box_confidence x class_prob
+        raw_detections[:, 4] = raw_detections[:, 4] * raw_detections[:, 6]
+        # Reorder elements by the score, best on top, remove class_prob
+        ordered = raw_detections[raw_detections[:, 4].argsort()[::-1]][:, 0:6]
+        # transform width to right with clamp to 0..1
+        ordered[:, 2] = np.clip(ordered[:, 2] + ordered[:, 0], 0, 1)
+        # transform height to bottom with clamp to 0..1
+        ordered[:, 3] = np.clip(ordered[:, 3] + ordered[:, 1], 0, 1)
+        # put result into the correct order and limit to top 20
+        detections = ordered[:, [5, 4, 1, 0, 3, 2]][:20]
+        # pad to 20x6 shape
+        append_cnt = 20 - len(detections)
+        if append_cnt > 0:
+            detections = np.append(
+                detections, np.zeros((append_cnt, 6), np.float32), axis=0
+            )
+
+        # self.fps.update()
+        return detections
diff --git a/requirements-tensorrt.txt b/requirements-tensorrt.txt
new file mode 100644
index 000000000..e36fd6b18
--- /dev/null
+++ b/requirements-tensorrt.txt
@@ -0,0 +1,5 @@
+cuda-python == 11.8.*
+tensorrt == 8.5.*
+nvidia-cuda-runtime-cu11 == 11.8.*
+nvidia-cublas-cu11 == 11.11.*
+nvidia-cudnn-cu11 == 8.7.*
\ No newline at end of file
diff --git a/requirements-wheels.txt b/requirements-wheels.txt
index 71962fac9..395d2a5e6 100644
--- a/requirements-wheels.txt
+++ b/requirements-wheels.txt
@@ -24,6 +24,8 @@ zeroconf == 0.39.4
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-manylinux_2_31_x86_64.whl; platform_machine == 'x86_64'
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-linux_aarch64.whl; platform_machine == 'aarch64'
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-linux_armv7l.whl; platform_machine == 'armv7l'
+# NVidia TensorRT Support
+cuda-python == 11.8.*; platform_machine == 'x86_64'
 tensorrt == 8.5.*; platform_machine == 'x86_64'
 nvidia-cuda-runtime-cu11 == 11.8.*; platform_machine == 'x86_64'
 nvidia-cublas-cu11 == 11.11.*; platform_machine == 'x86_64'
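
Note: a minimal smoke test of the new detector might look like the sketch below. It is not part of this patch; the engine path and the stub config object are placeholders (TensorRtDetector.__init__ only reads model_config.path), and it assumes a serialized TensorRT engine built for a 416x416 YOLO model is already available.

import numpy as np

from frigate.detectors.tensorrt import TensorRtDetector


class StubModelConfig:
    # Placeholder: only .path is read by TensorRtDetector.__init__
    path = "/models/yolov4-416.trt"  # hypothetical engine location


detector = TensorRtDetector(model_config=StubModelConfig())

# detect_raw() takes an unnormalized [height, width, 3] frame matching the
# engine's input size; scaling to 0..1 and the HWC->CHW transpose happen
# inside detect_raw().
frame = np.zeros((416, 416, 3), dtype=np.uint8)
results = detector.detect_raw(frame)

# results is a 20x6 float32 array of
# [class_id, score, top, left, bottom, right] rows, zero-padded to 20 rows.
print(results.shape)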