From 0097ddb7cb36f8d7168155d2897e7f43de7271f0 Mon Sep 17 00:00:00 2001 From: YS Date: Tue, 28 Dec 2021 14:53:47 +0300 Subject: [PATCH] make TensorRT works (and break edgetpu) --- Makefile | 29 ++++- converters/yolo4/assets/run.sh | 1 + converters/yolo4/build.sh | 2 + docker/Dockerfile.l4t.base | 47 +++---- docker/Dockerfile.l4t.nginx | 2 +- frigate/config.py | 1 + frigate/detection/__init__.py | 133 +++++++++++++++++--- frigate/detection/edgetpu.py | 86 ++++--------- frigate/detection/tensorrt.py | 223 +++++++++++++++++++++++++++++++++ frigate/http.py | 5 +- frigate/object_processing.py | 2 +- frigate/util.py | 8 +- frigate/video.py | 14 +-- 13 files changed, 430 insertions(+), 123 deletions(-) create mode 100644 frigate/detection/tensorrt.py diff --git a/Makefile b/Makefile index d11f9359a..002be3119 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,10 @@ amd64_ffmpeg: docker build --no-cache --pull --tag blakeblackshear/frigate-ffmpeg:1.2.0-amd64 --file docker/Dockerfile.ffmpeg.amd64 . nginx_frigate: - docker buildx build --push --platform linux/arm/v7,linux/arm64/v8,linux/amd64 --tag blakeblackshear/frigate-nginx:1.0.2 --file docker/Dockerfile.nginx . + docker build --tag blakeblackshear/frigate-nginx:1.0.2 --file docker/Dockerfile.nginx . + +nginx_frigate_l4t: + docker build --tag blakeblackshear/frigate-nginx-l4t:1.0.2 --file docker/Dockerfile.l4t.nginx . amd64_frigate: version web docker build --no-cache --tag frigate-base --build-arg ARCH=amd64 --build-arg FFMPEG_VERSION=1.1.0 --build-arg WHEELS_VERSION=1.0.3 --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.base . @@ -41,17 +44,35 @@ aarch64_wheels: aarch64_ffmpeg: docker build --no-cache --pull --tag blakeblackshear/frigate-ffmpeg:1.3.0-aarch64 --file docker/Dockerfile.ffmpeg.aarch64 . 
-aarch64_frigate: version web - docker build --no-cache --tag frigate-base --build-arg ARCH=aarch64 --build-arg FFMPEG_VERSION=1.0.0 --build-arg WHEELS_VERSION=1.0.3 --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.base . +aarch64_frigate: + docker build --no-cache --tag frigate-base --build-arg ARCH=aarch64 --build-arg FFMPEG_VERSION=1.3.0 --build-arg WHEELS_VERSION=1.0.3 --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.base . docker build --no-cache --tag frigate --file docker/Dockerfile.aarch64 . +aarch64_dev: + docker build --tag frigate --file docker/Dockerfile.aarch64 . + aarch64_all: aarch64_wheels aarch64_ffmpeg aarch64_frigate +l4t_assets_yolo4: + mkdir -p $$(pwd)/.l4t_assets + cp ./converters/yolo4/plugin/* .l4t_assets/ + cp ./converters/yolo4/model/yolov4-tiny-416.trt .l4t_assets/yolov4-tiny-416.trt + cp ./converters/yolo4/model/yolov4-tiny-288.trt .l4t_assets/yolov4-tiny-288.trt + # cp ./converters/yolo4/model/yolov4-416.trt .l4t_assets/yolov4-416.trt + # cp ./converters/yolo4/model/yolov4-288.trt .l4t_assets/yolov4-288.trt + +l4t_dev: # l4t_assets_yolo4 + nvidia-docker build --tag frigate.l4t --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.l4t.base . + +l4t_dev_test: + nvidia-docker build --tag frigate.l4t.onnx --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.l4t.onnx ./onnx_test/ + + armv7_wheels: docker build --tag blakeblackshear/frigate-wheels:1.0.3-armv7 --file docker/Dockerfile.wheels . armv7_ffmpeg: - docker build --no-cache --pull --tag blakeblackshear/frigate-ffmpeg:1.2.0-armv7 --file docker/Dockerfile.ffmpeg.armv7 . + docker build --pull --tag blakeblackshear/frigate-ffmpeg:1.2.0-armv7 --file docker/Dockerfile.ffmpeg.armv7 . armv7_frigate: version web docker build --no-cache --tag frigate-base --build-arg ARCH=armv7 --build-arg FFMPEG_VERSION=1.0.0 --build-arg WHEELS_VERSION=1.0.3 --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.base . 
diff --git a/converters/yolo4/assets/run.sh b/converters/yolo4/assets/run.sh index 7c19d224a..772ee43c0 100755 --- a/converters/yolo4/assets/run.sh +++ b/converters/yolo4/assets/run.sh @@ -2,6 +2,7 @@ set -xe cd /tensorrt_demos/plugins && make +cp /tensorrt_demos/plugins/libyolo_layer.so /plugin/libyolo_layer.so cd /tensorrt_demos/yolo for model in yolov4-tiny-288 \ diff --git a/converters/yolo4/build.sh b/converters/yolo4/build.sh index 901096f0b..60d6887aa 100755 --- a/converters/yolo4/build.sh +++ b/converters/yolo4/build.sh @@ -1,12 +1,14 @@ #!/bin/bash mkdir -p $(pwd)/model +mkdir -p $(pwd)/plugin docker build --tag models.yolo4 --file ./Dockerfile.l4t.tf15 ./assets/ sudo docker run --rm -it --name models.yolo4 \ --mount type=tmpfs,target=/tmp/cache,tmpfs-size=1000000000 \ -v $(pwd)/model:/model:rw \ + -v $(pwd)/plugin:/plugin:rw \ -v /tmp/argus_socket:/tmp/argus_socket \ -e NVIDIA_VISIBLE_DEVICES=all \ -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video \ diff --git a/docker/Dockerfile.l4t.base b/docker/Dockerfile.l4t.base index f57c50c57..548561911 100644 --- a/docker/Dockerfile.l4t.base +++ b/docker/Dockerfile.l4t.base @@ -1,21 +1,24 @@ ARG NGINX_VERSION FROM blakeblackshear/frigate-nginx-l4t:${NGINX_VERSION} as nginx FROM frigate-web as web -FROM nvcr.io/nvidia/l4t-tensorflow:r32.6.1-tf2.5-py3 as wheels +FROM nvcr.io/nvidia/l4t-tensorrt:r8.0.1-runtime as wheels ENV DEBIAN_FRONTEND=noninteractive RUN apt-get -qq update \ && apt-get -qq install -y \ - python3.8 \ python3.8-dev \ wget \ # opencv dependencies - build-essential cmake git pkg-config libgtk-3-dev \ + build-essential cmake git pkg-config libgtk-3-dev + +RUN apt-get -qq install -y \ libavcodec-dev libavformat-dev libswscale-dev libv4l-dev \ - libxvidcore-dev libx264-dev libjpeg-dev libpng-dev libtiff-dev \ + libxvidcore-dev libx264-dev libjpeg-dev libpng-dev libtiff-dev +RUN apt-get -qq install -y \ gfortran openexr libatlas-base-dev libssl-dev\ - libtbb2 libtbb-dev libdc1394-22-dev libopenexr-dev 
\ + libtbb2 libtbb-dev libdc1394-22-dev libopenexr-dev +RUN apt-get -qq install -y \ libgstreamer-plugins-base1.0-dev libgstreamer1.0-dev \ # scipy dependencies gcc gfortran libopenblas-dev liblapack-dev cython @@ -39,7 +42,7 @@ RUN pip3 wheel --wheel-dir=/wheels \ setproctitle \ peewee -FROM nvcr.io/nvidia/l4t-tensorflow:r32.6.1-tf2.5-py3 +FROM nvcr.io/nvidia/l4t-tensorrt:r8.0.1-runtime ENV DEBIAN_FRONTEND=noninteractive RUN \ apt-get update && apt-get install -y gnupg @@ -78,20 +81,11 @@ RUN pip3 install \ peewee_migrate \ pydantic \ zeroconf \ - ws4py \ - # Python 3.6 - shared-memory38 + ws4py -# setup gstreamer -RUN \ - apt-get update && apt-get install -y software-properties-common && \ - add-apt-repository universe && \ - add-apt-repository multiverse && \ - apt-get update - RUN \ - apt-get install -y gstreamer1.0-tools gstreamer1.0-alsa \ + apt-get update && apt-get install -y gstreamer1.0-tools gstreamer1.0-alsa \ gstreamer1.0-plugins-base gstreamer1.0-plugins-good \ gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly @@ -110,24 +104,35 @@ RUN wget -q https://github.com/google-coral/test_data/raw/release-frogfish/ssdli COPY --from=nginx /usr/local/nginx/ /usr/local/nginx/ COPY --from=web /opt/frigate/build /opt/frigate/web/ +# install TRT dependencies +RUN apt-get update && apt-get install -y git sudo +ADD docker/l4t/ /l4t/ +RUN /l4t/install_pycuda.sh + # s6-overlay COPY docker/rootfs/ / ADD https://github.com/just-containers/s6-overlay/releases/download/v2.2.0.3/s6-overlay-aarch64-installer /tmp/ RUN chmod +x /tmp/s6-overlay-aarch64-installer && /tmp/s6-overlay-aarch64-installer / -WORKDIR /opt/frigate/ -ADD frigate frigate/ -ADD migrations migrations/ COPY labelmap.txt /labelmap.txt +COPY detect.tflite /detect.tflite # edgetpu experiments RUN wget -q https://github.com/Azure/Azure-AI-Camp/releases/download/v1.0/yolov4-tiny.tflite -O /yolov4-tiny.tflite + + +# TRT Yolo4 Plugin +ADD .l4t_assets /yolo4/ + EXPOSE 5000 EXPOSE 1935 +WORKDIR /opt/frigate/ + +ADD 
frigate frigate/ +ADD migrations migrations/ ENTRYPOINT ["/init"] CMD ["python3", "-u", "-m", "frigate"] -COPY frigate frigate/ diff --git a/docker/Dockerfile.l4t.nginx b/docker/Dockerfile.l4t.nginx index 06e323318..79ffc4863 100644 --- a/docker/Dockerfile.l4t.nginx +++ b/docker/Dockerfile.l4t.nginx @@ -1,4 +1,4 @@ -FROM nvcr.io/nvidia/l4t-tensorflow:r32.6.1-tf2.5-py3 AS base +FROM nvcr.io/nvidia/l4t-tensorrt:r8.0.1-runtime AS base ENV DEBIAN_FRONTEND=noninteractive RUN apt-get -yqq update && \ diff --git a/frigate/config.py b/frigate/config.py index 0796c2a10..b2d1a69a9 100644 --- a/frigate/config.py +++ b/frigate/config.py @@ -36,6 +36,7 @@ class FrigateBaseModel(BaseModel): class DetectorTypeEnum(str, Enum): edgetpu = "edgetpu" cpu = "cpu" + tensorrt = "tensorrt" class DetectorConfig(FrigateBaseModel): diff --git a/frigate/detection/__init__.py b/frigate/detection/__init__.py index d0ff60440..94180756e 100644 --- a/frigate/detection/__init__.py +++ b/frigate/detection/__init__.py @@ -1,12 +1,113 @@ +import datetime import logging +import multiprocessing as mp +import os +import queue +import signal +import threading +import os import numpy as np import multiprocessing as mp -from frigate.util import EventsPerSecond +from frigate.util import EventsPerSecond, SharedMemoryFrameManager, listen from frigate.config import DetectorConfig, DetectorTypeEnum +from frigate.detection.object_detector import ObjectDetector +import importlib +from setproctitle import setproctitle +from typing import Dict, Callable + logger = logging.getLogger(__name__) +DETECTORS = { + DetectorTypeEnum.cpu: "edgetpu", + DetectorTypeEnum.edgetpu: "edgetpu", + DetectorTypeEnum.tensorrt: "tensorrt", +} + + +def get_object_detector_factory( + detector_config: DetectorConfig, model_path: str +) -> Callable[[], ObjectDetector]: + """ + Return an object detector factory. 
+ Since resource initialization might be performed on python import, + delay module load until the thread started + """ + detector_module = DETECTORS.get(detector_config.type) + if detector_module is None: + logger.error(f"Unsupported detector type '{detector_config.type}'.") + return None + + def _detector_factory() -> ObjectDetector: + path = os.path.join(os.path.dirname(__file__), f"{detector_module}.py") + spec = importlib.util.spec_from_file_location( + f"frigate.detection.{detector_module}", path + ) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + object_detector = module.object_detector_factory(detector_config, model_path) + return object_detector + + return _detector_factory + + +def run_detector( + name: str, + detection_queue: mp.Queue, + out_events: Dict[str, mp.Event], + avg_speed, + start, + model_shape, + object_detector_factory: Callable[[], ObjectDetector], +): + threading.current_thread().name = f"detector:{name}" + logger = logging.getLogger(f"detector.{name}") + logger.info(f"Starting detection process: {os.getpid()}") + setproctitle(f"frigate.detector.{name}") + listen() + + stop_event = mp.Event() + + def receiveSignal(signalNumber, frame): + stop_event.set() + + signal.signal(signal.SIGTERM, receiveSignal) + signal.signal(signal.SIGINT, receiveSignal) + + frame_manager = SharedMemoryFrameManager() + + outputs = {} + for name in out_events.keys(): + out_shm = mp.shared_memory.SharedMemory(name=f"out-{name}", create=False) + out_np = np.ndarray((20, 6), dtype=np.float32, buffer=out_shm.buf) + outputs[name] = {"shm": out_shm, "np": out_np} + + object_detector = object_detector_factory() + while not stop_event.is_set(): + try: + connection_id = detection_queue.get(timeout=5) + except queue.Empty: + continue + input_frame = frame_manager.get( + connection_id, (model_shape[0], model_shape[1], 3) + ) + + if input_frame is None: + continue + + # detect and send the output + start.value = 
datetime.datetime.now().timestamp() + detections = object_detector.detect_raw(input_frame) + duration = datetime.datetime.now().timestamp() - start.value + outputs[connection_id]["np"][:] = detections[:] + out_events[connection_id].set() + start.value = 0.0 + + avg_speed.value = (avg_speed.value * 9 + duration) / 10 + del object_detector + + class DetectionProcess: def __init__( self, @@ -27,18 +128,11 @@ class DetectionProcess: self.model_shape = model_shape self.detector_config = detector_config - self.detector_target = None - if ( - detector_config.type == DetectorTypeEnum.cpu - or detector_config.type == DetectorTypeEnum.edgetpu - ): - from .edgetpu import run_detector as edgetpu_detector - - self.detector_target = edgetpu_detector - - assert self.detector_target, "Invalid detector configuration" - - self.start_or_restart() + self.object_detector_factory = get_object_detector_factory( + detector_config, model_path + ) + if self.object_detector_factory: + self.start_or_restart() def stop(self): self.detect_process.terminate() @@ -54,7 +148,7 @@ class DetectionProcess: if (not self.detect_process is None) and self.detect_process.is_alive(): self.stop() self.detect_process = mp.Process( - target=self.detector_target, + target=run_detector, name=f"detector:{self.name}", args=( self.name, @@ -62,9 +156,8 @@ class DetectionProcess: self.out_events, self.avg_inference_speed, self.detection_start, - self.model_path, self.model_shape, - self.detector_config, + self.object_detector_factory, ), ) self.detect_process.daemon = True @@ -103,9 +196,11 @@ class RemoteObjectDetector: for d in self.out_np_shm: if d[1] < threshold: break - detections.append( - (self.labels[int(d[0])], float(d[1]), (d[2], d[3], d[4], d[5])) - ) + label_key = int(d[0]) + if label_key in self.labels: + detections.append( + (self.labels[label_key], float(d[1]), (d[2], d[3], d[4], d[5])) + ) self.fps.update() return detections diff --git a/frigate/detection/edgetpu.py b/frigate/detection/edgetpu.py 
index 4919ca805..41190839d 100644 --- a/frigate/detection/edgetpu.py +++ b/frigate/detection/edgetpu.py @@ -1,26 +1,39 @@ -import datetime import logging import multiprocessing as mp import os import queue import signal import threading -from frigate.config import DetectorConfig +from frigate.config import DetectorConfig, DetectorTypeEnum from typing import Dict import numpy as np # import tflite_runtime.interpreter as tflite -from setproctitle import setproctitle + # from tflite_runtime.interpreter import load_delegate -from frigate.util import EventsPerSecond, SharedMemoryFrameManager, listen +from frigate.util import EventsPerSecond from .object_detector import ObjectDetector logger = logging.getLogger(__name__) +def object_detector_factory(detector_config: DetectorConfig, model_path: str): + if not ( + detector_config.type == DetectorTypeEnum.cpu + or detector_config.type == DetectorTypeEnum.edgetpu + ): + return None + object_detector = LocalObjectDetector( + tf_device=detector_config.device, + model_path=model_path, + num_threads=detector_config.num_threads, + ) + return object_detector + + class LocalObjectDetector(ObjectDetector): def __init__(self, tf_device=None, model_path=None, num_threads=3): self.fps = EventsPerSecond() @@ -80,6 +93,11 @@ class LocalObjectDetector(ObjectDetector): return detections def detect_raw(self, tensor_input): + logger.error(">>>>>>>>>> detect raw") + + # Expand dimensions [height, width, 3] since the model expects images to have shape [1, height, width, 3] + tensor_input = np.expand_dims(tensor_input, axis=0) + # self.interpreter.set_tensor(self.tensor_input_details[0]["index"], tensor_input) # self.interpreter.invoke() @@ -105,63 +123,3 @@ class LocalObjectDetector(ObjectDetector): # ] return detections - - -def run_detector( - name: str, - detection_queue: mp.Queue, - out_events: Dict[str, mp.Event], - avg_speed, - start, - model_path, - model_shape, - detector_config: DetectorConfig, -): - threading.current_thread().name = 
f"detector:{name}" - logger = logging.getLogger(f"detector.{name}") - logger.info(f"Starting detection process: {os.getpid()}") - setproctitle(f"frigate.detector.{name}") - listen() - - stop_event = mp.Event() - - def receiveSignal(signalNumber, frame): - stop_event.set() - - signal.signal(signal.SIGTERM, receiveSignal) - signal.signal(signal.SIGINT, receiveSignal) - - frame_manager = SharedMemoryFrameManager() - object_detector = LocalObjectDetector( - tf_device=detector_config.device, - model_path=model_path, - num_threads=detector_config.num_threads, - ) - - outputs = {} - for name in out_events.keys(): - out_shm = mp.shared_memory.SharedMemory(name=f"out-{name}", create=False) - out_np = np.ndarray((20, 6), dtype=np.float32, buffer=out_shm.buf) - outputs[name] = {"shm": out_shm, "np": out_np} - - while not stop_event.is_set(): - try: - connection_id = detection_queue.get(timeout=5) - except queue.Empty: - continue - input_frame = frame_manager.get( - connection_id, (1, model_shape[0], model_shape[1], 3) - ) - - if input_frame is None: - continue - - # detect and send the output - start.value = datetime.datetime.now().timestamp() - detections = object_detector.detect_raw(input_frame) - duration = datetime.datetime.now().timestamp() - start.value - outputs[connection_id]["np"][:] = detections[:] - out_events[connection_id].set() - start.value = 0.0 - - avg_speed.value = (avg_speed.value * 9 + duration) / 10 diff --git a/frigate/detection/tensorrt.py b/frigate/detection/tensorrt.py new file mode 100644 index 000000000..3791000c9 --- /dev/null +++ b/frigate/detection/tensorrt.py @@ -0,0 +1,223 @@ +import logging +from frigate.config import DetectorConfig, DetectorTypeEnum +from frigate.util import EventsPerSecond +import ctypes +import numpy as np +import tensorrt as trt +import pycuda.driver as cuda +from .object_detector import ObjectDetector +import pycuda.autoinit # This is needed for initializing CUDA driver + +logger = logging.getLogger(__name__) + + +def 
object_detector_factory(detector_config: DetectorConfig, model_path: str): + if detector_config.type != DetectorTypeEnum.tensorrt: + return None + try: + ctypes.cdll.LoadLibrary("/yolo4/libyolo_layer.so") + except OSError as e: + logger.error("ERROR: failed to load /yolo4/libyolo_layer.so. %s", e) + return LocalObjectDetector(detector_config, model_path) + + +class HostDeviceMem(object): + """Simple helper data class that's a little nicer to use than a 2-tuple.""" + + def __init__(self, host_mem, device_mem): + self.host = host_mem + self.device = device_mem + + def __str__(self): + return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) + + def __repr__(self): + return self.__str__() + + +class LocalObjectDetector(ObjectDetector): + def _load_engine(self, model_path): + with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def _get_input_shape(self): + """Get input shape of the TensorRT YOLO engine.""" + binding = self.engine[0] + assert self.engine.binding_is_input(binding) + binding_dims = self.engine.get_binding_shape(binding) + if len(binding_dims) == 4: + return tuple(binding_dims[2:]) + elif len(binding_dims) == 3: + return tuple(binding_dims[1:]) + else: + raise ValueError( + "bad dims of binding %s: %s" % (binding, str(binding_dims)) + ) + + def _allocate_buffers(self): + """Allocates all host/device in/out buffers required for an engine.""" + inputs = [] + outputs = [] + bindings = [] + output_idx = 0 + stream = cuda.Stream() + for binding in self.engine: + binding_dims = self.engine.get_binding_shape(binding) + if len(binding_dims) == 4: + # explicit batch case (TensorRT 7+) + size = trt.volume(binding_dims) + elif len(binding_dims) == 3: + # implicit batch case (TensorRT 6 or older) + size = trt.volume(binding_dims) * self.engine.max_batch_size + else: + raise ValueError( + "bad dims of binding %s: %s" % (binding, str(binding_dims)) + ) + dtype = 
trt.nptype(self.engine.get_binding_dtype(binding)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + device_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + bindings.append(int(device_mem)) + # Append to the appropriate list. + if self.engine.binding_is_input(binding): + inputs.append(HostDeviceMem(host_mem, device_mem)) + else: + # each grid has 3 anchors, each anchor generates a detection + # output of 7 float32 values + assert size % 7 == 0 + outputs.append(HostDeviceMem(host_mem, device_mem)) + output_idx += 1 + assert len(inputs) == 1 + assert len(outputs) == 1 + return inputs, outputs, bindings, stream + + def _do_inference(self): + """do_inference (for TensorRT 7.0+) + + This function is generalized for multiple inputs/outputs for full + dimension networks. + Inputs and outputs are expected to be lists of HostDeviceMem objects. + """ + # Transfer input data to the GPU. + [ + cuda.memcpy_htod_async(inp.device, inp.host, self.stream) + for inp in self.inputs + ] + # Run inference. + self.context.execute_async_v2( + bindings=self.bindings, stream_handle=self.stream.handle + ) + # Transfer predictions back from the GPU. + [ + cuda.memcpy_dtoh_async(out.host, out.device, self.stream) + for out in self.outputs + ] + # Synchronize the stream + self.stream.synchronize() + # Return only the host outputs. 
+ return [out.host for out in self.outputs] + + def __init__(self, detector_config: DetectorConfig, model_path: str): + self.fps = EventsPerSecond() + self.conf_th = 0.4 ##TODO: model config parameter + self.nms_threshold = 0.4 + self.trt_logger = trt.Logger(trt.Logger.INFO) + self.engine = self._load_engine(model_path) + self.input_shape = self._get_input_shape() + + try: + self.context = self.engine.create_execution_context() + ( + self.inputs, + self.outputs, + self.bindings, + self.stream, + ) = self._allocate_buffers() + except Exception as e: + logger.error(e) + raise RuntimeError("fail to allocate CUDA resources") from e + + logger.debug("TensorRT loaded. Input shape is %s", self.input_shape) + logger.debug("TensorRT version is %s", trt.__version__[0]) + + def __del__(self): + """Free CUDA memories.""" + del self.outputs + del self.inputs + del self.stream + + def _postprocess_yolo(self, trt_outputs, img_w, img_h, conf_th, nms_threshold): + """Postprocess TensorRT outputs. + + # Args + trt_outputs: a list of 2 or 3 tensors, where each tensor + contains a multiple of 7 float32 numbers in + the order of [x, y, w, h, box_confidence, class_id, class_prob] + conf_th: confidence threshold + + # Returns + boxes, scores, classes + """ + # filter low-conf detections and concatenate results of all yolo layers + detections = [] + for o in trt_outputs: + dets = o.reshape((-1, 7)) + dets = dets[dets[:, 4] * dets[:, 6] >= conf_th] + detections.append(dets) + detections = np.concatenate(detections, axis=0) + + return detections + + def detect(self, tensor_input, threshold=0.4): + pass + + def detect_raw(self, tensor_input): + # Input tensor has the shape of the [height, width, 3] + # Output tensor of float32 of shape [20, 6] where: + # 0 - class id + # 1 - score + # 2..5 - a value between 0 and 1 of the box: [top, left, bottom, right] + + # transform [height, width, 3] into (3, H, W) + tensor_input = tensor_input.transpose((2, 0, 1)).astype(np.float32) + + # normalize + 
tensor_input /= 255.0 + + self.inputs[0].host = np.ascontiguousarray(tensor_input) + trt_outputs = self._do_inference() + + raw_detections = self._postprocess_yolo( + trt_outputs, + tensor_input.shape[1], + tensor_input.shape[0], + self.conf_th, + nms_threshold=self.nms_threshold, + ) + + if len(raw_detections) == 0: + return np.zeros((20, 6), np.float32) + + # raw_detections: Nx7 numpy arrays of + # [[x, y, w, h, box_confidence, class_id, class_prob], + + # Calculate score as box_confidence x class_prob + raw_detections[:, 4] = raw_detections[:, 4] * raw_detections[:, 6] + # Reorder elements by the score, best on top, remove class_prob + ordered = raw_detections[raw_detections[:, 4].argsort()[::-1]][:, 0:6] + # transform width to right with clamp to 0..1 + ordered[:, 2] = np.clip(ordered[:, 2] + ordered[:, 0], 0, 1) + # transform height to bottom with clamp to 0..1 + ordered[:, 3] = np.clip(ordered[:, 3] + ordered[:, 1], 0, 1) + # put result into the correct order and limit to top 20 + detections = ordered[:, [5, 4, 1, 0, 3, 2]][:20] + # pad to 20x6 shape + append_cnt = 20 - len(detections) + if append_cnt > 0: + detections = np.append( + detections, np.zeros((append_cnt, 6), np.float32), axis=0 + ) + + self.fps.update() + return detections diff --git a/frigate/http.py b/frigate/http.py index 1627920d6..9b4999caa 100644 --- a/frigate/http.py +++ b/frigate/http.py @@ -358,9 +358,10 @@ def best(camera_name, label): crop = bool(request.args.get("crop", 0, type=int)) if crop: - box = best_object.get("box", (0, 0, 300, 300)) + box_size = 300 + box = best_object.get("box", (0, 0, box_size, box_size)) region = calculate_region( - best_frame.shape, box[0], box[1], box[2], box[3], 1.1 + best_frame.shape, box[0], box[1], box[2], box[3], box_size, multiplier=1.1 ) best_frame = best_frame[region[1] : region[3], region[0] : region[2]] diff --git a/frigate/object_processing.py b/frigate/object_processing.py index bfbc0e414..86d1cfc01 100644 --- a/frigate/object_processing.py 
+++ b/frigate/object_processing.py @@ -262,7 +262,7 @@ class TrackedObject: if crop: box = self.thumbnail_data["box"] region = calculate_region( - best_frame.shape, box[0], box[1], box[2], box[3], 1.1 + best_frame.shape, box[0], box[1], box[2], box[3], 300, multiplier=1.1 ) best_frame = best_frame[region[1] : region[3], region[0] : region[2]] diff --git a/frigate/util.py b/frigate/util.py index aa7abd004..8653cf829 100755 --- a/frigate/util.py +++ b/frigate/util.py @@ -189,12 +189,12 @@ def draw_box_with_label( ) -def calculate_region(frame_shape, xmin, ymin, xmax, ymax, multiplier=2): +def calculate_region(frame_shape, xmin, ymin, xmax, ymax, model_size, multiplier=2): # size is the longest edge and divisible by 4 size = int(max(xmax - xmin, ymax - ymin) // 4 * 4 * multiplier) - # dont go any smaller than 300 - if size < 300: - size = 300 + # dont go any smaller than the model_size + if size < model_size: + size = model_size # x_offset is midpoint of bounding box minus half the size x_offset = int((xmax - xmin) / 2.0 + xmin - size / 2.0) diff --git a/frigate/video.py b/frigate/video.py index c4d18cf5a..1a9f5a7be 100755 --- a/frigate/video.py +++ b/frigate/video.py @@ -74,14 +74,13 @@ def filtered(obj, objects_to_track, object_filters): def create_tensor_input(frame, model_shape, region): cropped_frame = yuv_region_2_rgb(frame, region) - # Resize to 300x300 if needed + # Resize to the model_shape if needed if cropped_frame.shape != (model_shape[0], model_shape[1], 3): cropped_frame = cv2.resize( cropped_frame, dsize=model_shape, interpolation=cv2.INTER_LINEAR ) - - # Expand dimensions since the model expects images to have shape: [1, height, width, 3] - return np.expand_dims(cropped_frame, axis=0) + # Return a tensor of shape: [height, width, 3] in RGB format + return cropped_frame def stop_ffmpeg(ffmpeg_process, logger): @@ -497,9 +496,10 @@ def process_frames( # combine motion boxes with known locations of existing objects combined_boxes = 
reduce_boxes(motion_boxes + tracked_object_boxes) + region_min_size = max(model_shape[0], model_shape[1]) # compute regions regions = [ - calculate_region(frame_shape, a[0], a[1], a[2], a[3], 1.2) + calculate_region(frame_shape, a[0], a[1], a[2], a[3], region_min_size, multiplier=1.2) for a in combined_boxes ] @@ -508,7 +508,7 @@ def process_frames( # re-compute regions regions = [ - calculate_region(frame_shape, a[0], a[1], a[2], a[3], 1.0) + calculate_region(frame_shape, a[0], a[1], a[2], a[3], region_min_size, multiplier=1.0) for a in combined_regions ] @@ -557,7 +557,7 @@ def process_frames( box = obj[2] # calculate a new region that will hopefully get the entire object region = calculate_region( - frame_shape, box[0], box[1], box[2], box[3] + frame_shape, box[0], box[1], box[2], box[3], region_min_size ) regions.append(region)