From 0097ddb7cb36f8d7168155d2897e7f43de7271f0 Mon Sep 17 00:00:00 2001
From: YS <ys@gm.com>
Date: Tue, 28 Dec 2021 14:53:47 +0300
Subject: [PATCH] Make TensorRT work (and break edgetpu)

---
 Makefile                       |  29 ++++-
 converters/yolo4/assets/run.sh |   1 +
 converters/yolo4/build.sh      |   2 +
 docker/Dockerfile.l4t.base     |  47 +++----
 docker/Dockerfile.l4t.nginx    |   2 +-
 frigate/config.py              |   1 +
 frigate/detection/__init__.py  | 133 +++++++++++++++++---
 frigate/detection/edgetpu.py   |  86 ++++---------
 frigate/detection/tensorrt.py  | 223 +++++++++++++++++++++++++++++++++
 frigate/http.py                |   5 +-
 frigate/object_processing.py   |   2 +-
 frigate/util.py                |   8 +-
 frigate/video.py               |  14 +--
 13 files changed, 430 insertions(+), 123 deletions(-)
 create mode 100644 frigate/detection/tensorrt.py

diff --git a/Makefile b/Makefile
index d11f9359a..002be3119 100644
--- a/Makefile
+++ b/Makefile
@@ -15,7 +15,10 @@ amd64_ffmpeg:
 	docker build --no-cache --pull --tag blakeblackshear/frigate-ffmpeg:1.2.0-amd64 --file docker/Dockerfile.ffmpeg.amd64 .
 
 nginx_frigate:
-	docker buildx build --push --platform linux/arm/v7,linux/arm64/v8,linux/amd64 --tag blakeblackshear/frigate-nginx:1.0.2 --file docker/Dockerfile.nginx .
+	docker build --tag blakeblackshear/frigate-nginx:1.0.2 --file docker/Dockerfile.nginx .
+
+nginx_frigate_l4t:
+	docker build --tag blakeblackshear/frigate-nginx-l4t:1.0.2 --file docker/Dockerfile.l4t.nginx .
 
 amd64_frigate: version web
 	docker build --no-cache --tag frigate-base --build-arg ARCH=amd64 --build-arg FFMPEG_VERSION=1.1.0 --build-arg WHEELS_VERSION=1.0.3 --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.base .
@@ -41,17 +44,35 @@ aarch64_wheels:
 aarch64_ffmpeg:
 	docker build --no-cache --pull --tag blakeblackshear/frigate-ffmpeg:1.3.0-aarch64 --file docker/Dockerfile.ffmpeg.aarch64 .
 
-aarch64_frigate: version web
-	docker build --no-cache --tag frigate-base --build-arg ARCH=aarch64 --build-arg FFMPEG_VERSION=1.0.0 --build-arg WHEELS_VERSION=1.0.3 --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.base .
+aarch64_frigate:
+	docker build --no-cache --tag frigate-base --build-arg ARCH=aarch64 --build-arg FFMPEG_VERSION=1.3.0 --build-arg WHEELS_VERSION=1.0.3 --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.base .
 	docker build --no-cache --tag frigate --file docker/Dockerfile.aarch64 .
 
+aarch64_dev:
+	docker build --tag frigate --file docker/Dockerfile.aarch64 .
+
 aarch64_all: aarch64_wheels aarch64_ffmpeg aarch64_frigate
 
+l4t_assets_yolo4:
+	mkdir -p $$(pwd)/.l4t_assets
+	cp ./converters/yolo4/plugin/* .l4t_assets/
+	cp ./converters/yolo4/model/yolov4-tiny-416.trt .l4t_assets/yolov4-tiny-416.trt
+	cp ./converters/yolo4/model/yolov4-tiny-288.trt .l4t_assets/yolov4-tiny-288.trt
+	# cp ./converters/yolo4/model/yolov4-416.trt .l4t_assets/yolov4-416.trt
+	# cp ./converters/yolo4/model/yolov4-288.trt .l4t_assets/yolov4-288.trt
+
+l4t_dev: # l4t_assets_yolo4
+	nvidia-docker build --tag frigate.l4t --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.l4t.base .
+
+l4t_dev_test:
+	nvidia-docker build --tag frigate.l4t.onnx --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.l4t.onnx ./onnx_test/
+
+
 armv7_wheels:
 	docker build --tag blakeblackshear/frigate-wheels:1.0.3-armv7 --file docker/Dockerfile.wheels .
 
 armv7_ffmpeg:
-	docker build --no-cache --pull --tag blakeblackshear/frigate-ffmpeg:1.2.0-armv7 --file docker/Dockerfile.ffmpeg.armv7 .
+	docker build --pull --tag blakeblackshear/frigate-ffmpeg:1.2.0-armv7 --file docker/Dockerfile.ffmpeg.armv7 .
 
 armv7_frigate: version web
 	docker build --no-cache --tag frigate-base --build-arg ARCH=armv7 --build-arg FFMPEG_VERSION=1.0.0 --build-arg WHEELS_VERSION=1.0.3 --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.base .
diff --git a/converters/yolo4/assets/run.sh b/converters/yolo4/assets/run.sh
index 7c19d224a..772ee43c0 100755
--- a/converters/yolo4/assets/run.sh
+++ b/converters/yolo4/assets/run.sh
@@ -2,6 +2,7 @@
 
 set -xe
 cd /tensorrt_demos/plugins && make
+cp /tensorrt_demos/plugins/libyolo_layer.so /plugin/libyolo_layer.so
 
 cd /tensorrt_demos/yolo
 for model in yolov4-tiny-288 \
diff --git a/converters/yolo4/build.sh b/converters/yolo4/build.sh
index 901096f0b..60d6887aa 100755
--- a/converters/yolo4/build.sh
+++ b/converters/yolo4/build.sh
@@ -1,12 +1,14 @@
 #!/bin/bash
 
 mkdir -p $(pwd)/model
+mkdir -p $(pwd)/plugin
 
 docker build --tag models.yolo4  --file ./Dockerfile.l4t.tf15 ./assets/
 
 sudo docker run --rm -it --name models.yolo4 \
     --mount type=tmpfs,target=/tmp/cache,tmpfs-size=1000000000 \
     -v $(pwd)/model:/model:rw \
+    -v $(pwd)/plugin:/plugin:rw \
     -v /tmp/argus_socket:/tmp/argus_socket \
     -e NVIDIA_VISIBLE_DEVICES=all \
     -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video \
diff --git a/docker/Dockerfile.l4t.base b/docker/Dockerfile.l4t.base
index f57c50c57..548561911 100644
--- a/docker/Dockerfile.l4t.base
+++ b/docker/Dockerfile.l4t.base
@@ -1,21 +1,24 @@
 ARG NGINX_VERSION
 FROM blakeblackshear/frigate-nginx-l4t:${NGINX_VERSION} as nginx
 FROM frigate-web as web
-FROM nvcr.io/nvidia/l4t-tensorflow:r32.6.1-tf2.5-py3 as wheels
+FROM nvcr.io/nvidia/l4t-tensorrt:r8.0.1-runtime as wheels
 
 ENV DEBIAN_FRONTEND=noninteractive
 
 RUN apt-get -qq update \
     && apt-get -qq install -y \
-    python3.8 \
     python3.8-dev \
     wget \
     # opencv dependencies
-    build-essential cmake git pkg-config libgtk-3-dev \
+    build-essential cmake git pkg-config libgtk-3-dev
+
+RUN apt-get -qq install -y \
     libavcodec-dev libavformat-dev libswscale-dev libv4l-dev \
-    libxvidcore-dev libx264-dev libjpeg-dev libpng-dev libtiff-dev \
+    libxvidcore-dev libx264-dev libjpeg-dev libpng-dev libtiff-dev
+RUN apt-get -qq install -y \
     gfortran openexr libatlas-base-dev libssl-dev\
-    libtbb2 libtbb-dev libdc1394-22-dev libopenexr-dev \
+    libtbb2 libtbb-dev libdc1394-22-dev libopenexr-dev
+RUN apt-get -qq install -y \
     libgstreamer-plugins-base1.0-dev libgstreamer1.0-dev \
     # scipy dependencies
     gcc gfortran libopenblas-dev liblapack-dev cython
@@ -39,7 +42,7 @@ RUN pip3 wheel --wheel-dir=/wheels \
     setproctitle \
     peewee
 
-FROM nvcr.io/nvidia/l4t-tensorflow:r32.6.1-tf2.5-py3
+FROM nvcr.io/nvidia/l4t-tensorrt:r8.0.1-runtime
 ENV DEBIAN_FRONTEND=noninteractive
 RUN \
     apt-get update && apt-get install -y gnupg
@@ -78,20 +81,11 @@ RUN pip3 install \
     peewee_migrate \
     pydantic \
     zeroconf \
-    ws4py \
-    # Python 3.6
-    shared-memory38
+    ws4py
 
 
-# setup gstreamer
-RUN \
-  apt-get update && apt-get install -y software-properties-common && \
-  add-apt-repository universe && \
-  add-apt-repository multiverse && \
-  apt-get update
-
 RUN \
-  apt-get install -y gstreamer1.0-tools gstreamer1.0-alsa \
+  apt-get update && apt-get install -y gstreamer1.0-tools gstreamer1.0-alsa \
     gstreamer1.0-plugins-base gstreamer1.0-plugins-good \
     gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly
 
@@ -110,24 +104,35 @@ RUN wget -q https://github.com/google-coral/test_data/raw/release-frogfish/ssdli
 COPY --from=nginx /usr/local/nginx/ /usr/local/nginx/
 COPY --from=web /opt/frigate/build /opt/frigate/web/
 
+# install TRT dependencies
+RUN apt-get update && apt-get install -y git sudo
+ADD docker/l4t/ /l4t/
+RUN /l4t/install_pycuda.sh
+
 # s6-overlay
 COPY docker/rootfs/ /
 ADD https://github.com/just-containers/s6-overlay/releases/download/v2.2.0.3/s6-overlay-aarch64-installer /tmp/
 RUN chmod +x /tmp/s6-overlay-aarch64-installer && /tmp/s6-overlay-aarch64-installer /
 
 
-WORKDIR /opt/frigate/
-ADD frigate frigate/
-ADD migrations migrations/
 COPY labelmap.txt /labelmap.txt
+COPY detect.tflite /detect.tflite
 
 
 # edgetpu experiments
 RUN wget -q https://github.com/Azure/Azure-AI-Camp/releases/download/v1.0/yolov4-tiny.tflite -O /yolov4-tiny.tflite
 
+
+
+# TRT Yolo4 Plugin
+ADD .l4t_assets /yolo4/
+
 EXPOSE 5000
 EXPOSE 1935
 
+WORKDIR /opt/frigate/
+
+ADD frigate frigate/
+ADD migrations migrations/
 ENTRYPOINT ["/init"]
 CMD ["python3", "-u", "-m", "frigate"]
-COPY frigate frigate/
diff --git a/docker/Dockerfile.l4t.nginx b/docker/Dockerfile.l4t.nginx
index 06e323318..79ffc4863 100644
--- a/docker/Dockerfile.l4t.nginx
+++ b/docker/Dockerfile.l4t.nginx
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/l4t-tensorflow:r32.6.1-tf2.5-py3 AS base
+FROM nvcr.io/nvidia/l4t-tensorrt:r8.0.1-runtime AS base
 
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get -yqq update && \
diff --git a/frigate/config.py b/frigate/config.py
index 0796c2a10..b2d1a69a9 100644
--- a/frigate/config.py
+++ b/frigate/config.py
@@ -36,6 +36,7 @@ class FrigateBaseModel(BaseModel):
 class DetectorTypeEnum(str, Enum):
     edgetpu = "edgetpu"
     cpu = "cpu"
+    tensorrt = "tensorrt"
 
 
 class DetectorConfig(FrigateBaseModel):
diff --git a/frigate/detection/__init__.py b/frigate/detection/__init__.py
index d0ff60440..94180756e 100644
--- a/frigate/detection/__init__.py
+++ b/frigate/detection/__init__.py
@@ -1,12 +1,113 @@
+import datetime
 import logging
+import multiprocessing as mp
+import os
+import queue
+import signal
+import threading
+import os
 import numpy as np
 import multiprocessing as mp
-from frigate.util import EventsPerSecond
+from frigate.util import EventsPerSecond, SharedMemoryFrameManager, listen
 from frigate.config import DetectorConfig, DetectorTypeEnum
+from frigate.detection.object_detector import ObjectDetector
+import importlib
+from setproctitle import setproctitle
+from typing import Dict, Callable
+
 
 logger = logging.getLogger(__name__)
 
 
+DETECTORS = {
+    DetectorTypeEnum.cpu: "edgetpu",
+    DetectorTypeEnum.edgetpu: "edgetpu",
+    DetectorTypeEnum.tensorrt: "tensorrt",
+}
+
+
+def get_object_detector_factory(
+    detector_config: DetectorConfig, model_path: str
+) -> Callable[[], ObjectDetector]:
+    """
+    Return an object detector factory.
+    Since resource initialization might be performed on python import,
+    delay module load until the thread started
+    """
+    detector_module = DETECTORS.get(detector_config.type)
+    if detector_module is None:
+        logger.error(f"Unsupported detector type '{detector_config.type}'.")
+        return None
+
+    def _detector_factory() -> ObjectDetector:
+        path = os.path.join(os.path.dirname(__file__), f"{detector_module}.py")
+        spec = importlib.util.spec_from_file_location(
+            f"frigate.detection.{detector_module}", path
+        )
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        object_detector = module.object_detector_factory(detector_config, model_path)
+        return object_detector
+
+    return _detector_factory
+
+
+def run_detector(
+    name: str,
+    detection_queue: mp.Queue,
+    out_events: Dict[str, mp.Event],
+    avg_speed,
+    start,
+    model_shape,
+    object_detector_factory: Callable[[], ObjectDetector],
+):
+    threading.current_thread().name = f"detector:{name}"
+    logger = logging.getLogger(f"detector.{name}")
+    logger.info(f"Starting detection process: {os.getpid()}")
+    setproctitle(f"frigate.detector.{name}")
+    listen()
+
+    stop_event = mp.Event()
+
+    def receiveSignal(signalNumber, frame):
+        stop_event.set()
+
+    signal.signal(signal.SIGTERM, receiveSignal)
+    signal.signal(signal.SIGINT, receiveSignal)
+
+    frame_manager = SharedMemoryFrameManager()
+
+    outputs = {}
+    for name in out_events.keys():
+        out_shm = mp.shared_memory.SharedMemory(name=f"out-{name}", create=False)
+        out_np = np.ndarray((20, 6), dtype=np.float32, buffer=out_shm.buf)
+        outputs[name] = {"shm": out_shm, "np": out_np}
+
+    object_detector = object_detector_factory()
+    while not stop_event.is_set():
+        try:
+            connection_id = detection_queue.get(timeout=5)
+        except queue.Empty:
+            continue
+        input_frame = frame_manager.get(
+            connection_id, (model_shape[0], model_shape[1], 3)
+        )
+
+        if input_frame is None:
+            continue
+
+        # detect and send the output
+        start.value = datetime.datetime.now().timestamp()
+        detections = object_detector.detect_raw(input_frame)
+        duration = datetime.datetime.now().timestamp() - start.value
+        outputs[connection_id]["np"][:] = detections[:]
+        out_events[connection_id].set()
+        start.value = 0.0
+
+        avg_speed.value = (avg_speed.value * 9 + duration) / 10
+    del object_detector
+
+
 class DetectionProcess:
     def __init__(
         self,
@@ -27,18 +128,11 @@ class DetectionProcess:
         self.model_shape = model_shape
         self.detector_config = detector_config
 
-        self.detector_target = None
-        if (
-            detector_config.type == DetectorTypeEnum.cpu
-            or detector_config.type == DetectorTypeEnum.edgetpu
-        ):
-            from .edgetpu import run_detector as edgetpu_detector
-
-            self.detector_target = edgetpu_detector
-
-        assert self.detector_target, "Invalid detector configuration"
-
-        self.start_or_restart()
+        self.object_detector_factory = get_object_detector_factory(
+            detector_config, model_path
+        )
+        if self.object_detector_factory:
+            self.start_or_restart()
 
     def stop(self):
         self.detect_process.terminate()
@@ -54,7 +148,7 @@ class DetectionProcess:
         if (not self.detect_process is None) and self.detect_process.is_alive():
             self.stop()
         self.detect_process = mp.Process(
-            target=self.detector_target,
+            target=run_detector,
             name=f"detector:{self.name}",
             args=(
                 self.name,
@@ -62,9 +156,8 @@ class DetectionProcess:
                 self.out_events,
                 self.avg_inference_speed,
                 self.detection_start,
-                self.model_path,
                 self.model_shape,
-                self.detector_config,
+                self.object_detector_factory,
             ),
         )
         self.detect_process.daemon = True
@@ -103,9 +196,11 @@ class RemoteObjectDetector:
         for d in self.out_np_shm:
             if d[1] < threshold:
                 break
-            detections.append(
-                (self.labels[int(d[0])], float(d[1]), (d[2], d[3], d[4], d[5]))
-            )
+            label_key = int(d[0])
+            if label_key in self.labels:
+                detections.append(
+                    (self.labels[label_key], float(d[1]), (d[2], d[3], d[4], d[5]))
+                )
         self.fps.update()
         return detections
 
diff --git a/frigate/detection/edgetpu.py b/frigate/detection/edgetpu.py
index 4919ca805..41190839d 100644
--- a/frigate/detection/edgetpu.py
+++ b/frigate/detection/edgetpu.py
@@ -1,26 +1,39 @@
-import datetime
 import logging
 import multiprocessing as mp
 import os
 import queue
 import signal
 import threading
-from frigate.config import DetectorConfig
+from frigate.config import DetectorConfig, DetectorTypeEnum
 from typing import Dict
 
 import numpy as np
 
 # import tflite_runtime.interpreter as tflite
-from setproctitle import setproctitle
+
 
 # from tflite_runtime.interpreter import load_delegate
 
-from frigate.util import EventsPerSecond, SharedMemoryFrameManager, listen
+from frigate.util import EventsPerSecond
 from .object_detector import ObjectDetector
 
 logger = logging.getLogger(__name__)
 
 
+def object_detector_factory(detector_config: DetectorConfig, model_path: str):
+    if not (
+        detector_config.type == DetectorTypeEnum.cpu
+        or detector_config.type == DetectorTypeEnum.edgetpu
+    ):
+        return None
+    object_detector = LocalObjectDetector(
+        tf_device=detector_config.device,
+        model_path=model_path,
+        num_threads=detector_config.num_threads,
+    )
+    return object_detector
+
+
 class LocalObjectDetector(ObjectDetector):
     def __init__(self, tf_device=None, model_path=None, num_threads=3):
         self.fps = EventsPerSecond()
@@ -80,6 +93,11 @@ class LocalObjectDetector(ObjectDetector):
         return detections
 
     def detect_raw(self, tensor_input):
+        logger.error(">>>>>>>>>> detect raw")
+
+        # Expand dimensions [height, width, 3] since the model expects images to have shape [1, height, width, 3]
+        tensor_input = np.expand_dims(tensor_input, axis=0)
+
         # self.interpreter.set_tensor(self.tensor_input_details[0]["index"], tensor_input)
         # self.interpreter.invoke()
 
@@ -105,63 +123,3 @@ class LocalObjectDetector(ObjectDetector):
         #     ]
 
         return detections
-
-
-def run_detector(
-    name: str,
-    detection_queue: mp.Queue,
-    out_events: Dict[str, mp.Event],
-    avg_speed,
-    start,
-    model_path,
-    model_shape,
-    detector_config: DetectorConfig,
-):
-    threading.current_thread().name = f"detector:{name}"
-    logger = logging.getLogger(f"detector.{name}")
-    logger.info(f"Starting detection process: {os.getpid()}")
-    setproctitle(f"frigate.detector.{name}")
-    listen()
-
-    stop_event = mp.Event()
-
-    def receiveSignal(signalNumber, frame):
-        stop_event.set()
-
-    signal.signal(signal.SIGTERM, receiveSignal)
-    signal.signal(signal.SIGINT, receiveSignal)
-
-    frame_manager = SharedMemoryFrameManager()
-    object_detector = LocalObjectDetector(
-        tf_device=detector_config.device,
-        model_path=model_path,
-        num_threads=detector_config.num_threads,
-    )
-
-    outputs = {}
-    for name in out_events.keys():
-        out_shm = mp.shared_memory.SharedMemory(name=f"out-{name}", create=False)
-        out_np = np.ndarray((20, 6), dtype=np.float32, buffer=out_shm.buf)
-        outputs[name] = {"shm": out_shm, "np": out_np}
-
-    while not stop_event.is_set():
-        try:
-            connection_id = detection_queue.get(timeout=5)
-        except queue.Empty:
-            continue
-        input_frame = frame_manager.get(
-            connection_id, (1, model_shape[0], model_shape[1], 3)
-        )
-
-        if input_frame is None:
-            continue
-
-        # detect and send the output
-        start.value = datetime.datetime.now().timestamp()
-        detections = object_detector.detect_raw(input_frame)
-        duration = datetime.datetime.now().timestamp() - start.value
-        outputs[connection_id]["np"][:] = detections[:]
-        out_events[connection_id].set()
-        start.value = 0.0
-
-        avg_speed.value = (avg_speed.value * 9 + duration) / 10
diff --git a/frigate/detection/tensorrt.py b/frigate/detection/tensorrt.py
new file mode 100644
index 000000000..3791000c9
--- /dev/null
+++ b/frigate/detection/tensorrt.py
@@ -0,0 +1,223 @@
+import logging
+from frigate.config import DetectorConfig, DetectorTypeEnum
+from frigate.util import EventsPerSecond
+import ctypes
+import numpy as np
+import tensorrt as trt
+import pycuda.driver as cuda
+from .object_detector import ObjectDetector
+import pycuda.autoinit  # This is needed for initializing CUDA driver
+
+logger = logging.getLogger(__name__)
+
+
+def object_detector_factory(detector_config: DetectorConfig, model_path: str):
+    if detector_config.type != DetectorTypeEnum.tensorrt:
+        return None
+    try:
+        ctypes.cdll.LoadLibrary("/yolo4/libyolo_layer.so")
+    except OSError as e:
+        logger.error("ERROR: failed to load /yolo4/libyolo_layer.so. %s", e)
+    return LocalObjectDetector(detector_config, model_path)
+
+
+class HostDeviceMem(object):
+    """Simple helper data class that's a little nicer to use than a 2-tuple."""
+
+    def __init__(self, host_mem, device_mem):
+        self.host = host_mem
+        self.device = device_mem
+
+    def __str__(self):
+        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class LocalObjectDetector(ObjectDetector):
+    def _load_engine(self, model_path):
+        with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
+            return runtime.deserialize_cuda_engine(f.read())
+
+    def _get_input_shape(self):
+        """Get input shape of the TensorRT YOLO engine."""
+        binding = self.engine[0]
+        assert self.engine.binding_is_input(binding)
+        binding_dims = self.engine.get_binding_shape(binding)
+        if len(binding_dims) == 4:
+            return tuple(binding_dims[2:])
+        elif len(binding_dims) == 3:
+            return tuple(binding_dims[1:])
+        else:
+            raise ValueError(
+                "bad dims of binding %s: %s" % (binding, str(binding_dims))
+            )
+
+    def _allocate_buffers(self):
+        """Allocates all host/device in/out buffers required for an engine."""
+        inputs = []
+        outputs = []
+        bindings = []
+        output_idx = 0
+        stream = cuda.Stream()
+        for binding in self.engine:
+            binding_dims = self.engine.get_binding_shape(binding)
+            if len(binding_dims) == 4:
+                # explicit batch case (TensorRT 7+)
+                size = trt.volume(binding_dims)
+            elif len(binding_dims) == 3:
+                # implicit batch case (TensorRT 6 or older)
+                size = trt.volume(binding_dims) * self.engine.max_batch_size
+            else:
+                raise ValueError(
+                    "bad dims of binding %s: %s" % (binding, str(binding_dims))
+                )
+            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
+            # Allocate host and device buffers
+            host_mem = cuda.pagelocked_empty(size, dtype)
+            device_mem = cuda.mem_alloc(host_mem.nbytes)
+            # Append the device buffer to device bindings.
+            bindings.append(int(device_mem))
+            # Append to the appropriate list.
+            if self.engine.binding_is_input(binding):
+                inputs.append(HostDeviceMem(host_mem, device_mem))
+            else:
+                # each grid has 3 anchors, each anchor generates a detection
+                # output of 7 float32 values
+                assert size % 7 == 0
+                outputs.append(HostDeviceMem(host_mem, device_mem))
+                output_idx += 1
+        assert len(inputs) == 1
+        assert len(outputs) == 1
+        return inputs, outputs, bindings, stream
+
+    def _do_inference(self):
+        """do_inference (for TensorRT 7.0+)
+
+        This function is generalized for multiple inputs/outputs for full
+        dimension networks.
+        Inputs and outputs are expected to be lists of HostDeviceMem objects.
+        """
+        # Transfer input data to the GPU.
+        [
+            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
+            for inp in self.inputs
+        ]
+        # Run inference.
+        self.context.execute_async_v2(
+            bindings=self.bindings, stream_handle=self.stream.handle
+        )
+        # Transfer predictions back from the GPU.
+        [
+            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
+            for out in self.outputs
+        ]
+        # Synchronize the stream
+        self.stream.synchronize()
+        # Return only the host outputs.
+        return [out.host for out in self.outputs]
+
+    def __init__(self, detector_config: DetectorConfig, model_path: str):
+        self.fps = EventsPerSecond()
+        self.conf_th = 0.4  ##TODO: model config parameter
+        self.nms_threshold = 0.4
+        self.trt_logger = trt.Logger(trt.Logger.INFO)
+        self.engine = self._load_engine(model_path)
+        self.input_shape = self._get_input_shape()
+
+        try:
+            self.context = self.engine.create_execution_context()
+            (
+                self.inputs,
+                self.outputs,
+                self.bindings,
+                self.stream,
+            ) = self._allocate_buffers()
+        except Exception as e:
+            logger.error(e)
+            raise RuntimeError("fail to allocate CUDA resources") from e
+
+        logger.debug("TensorRT loaded. Input shape is %s", self.input_shape)
+        logger.debug("TensorRT version is %s", trt.__version__[0])
+
+    def __del__(self):
+        """Free CUDA memory."""
+        del self.outputs
+        del self.inputs
+        del self.stream
+
+    def _postprocess_yolo(self, trt_outputs, img_w, img_h, conf_th, nms_threshold):
+        """Postprocess TensorRT outputs.
+
+        # Args
+            trt_outputs: a list of 2 or 3 tensors, where each tensor
+                        contains a multiple of 7 float32 numbers in
+                        the order of [x, y, w, h, box_confidence, class_id, class_prob]
+            conf_th: confidence threshold
+
+        # Returns
+            boxes, scores, classes
+        """
+        # filter low-conf detections and concatenate results of all yolo layers
+        detections = []
+        for o in trt_outputs:
+            dets = o.reshape((-1, 7))
+            dets = dets[dets[:, 4] * dets[:, 6] >= conf_th]
+            detections.append(dets)
+        detections = np.concatenate(detections, axis=0)
+
+        return detections
+
+    def detect(self, tensor_input, threshold=0.4):
+        pass
+
+    def detect_raw(self, tensor_input):
+        # Input tensor has the shape [height, width, 3]
+        # Output is a float32 tensor of shape [20, 6] where each row holds:
+        # 0 - class id
+        # 1 - score
+        # 2..5 - box coordinates between 0 and 1: [top, left, bottom, right]
+
+        # transform [height, width, 3] into (3, H, W)
+        tensor_input = tensor_input.transpose((2, 0, 1)).astype(np.float32)
+
+        # normalize
+        tensor_input /= 255.0
+
+        self.inputs[0].host = np.ascontiguousarray(tensor_input)
+        trt_outputs = self._do_inference()
+
+        raw_detections = self._postprocess_yolo(
+            trt_outputs,
+            tensor_input.shape[1],
+            tensor_input.shape[0],
+            self.conf_th,
+            nms_threshold=self.nms_threshold,
+        )
+
+        if len(raw_detections) == 0:
+            return np.zeros((20, 6), np.float32)
+
+        # raw_detections: Nx7 numpy array with rows of
+        #             [x, y, w, h, box_confidence, class_id, class_prob]
+
+        # Calculate score as box_confidence x class_prob
+        raw_detections[:, 4] = raw_detections[:, 4] * raw_detections[:, 6]
+        # Reorder elements by the score, best on top, remove class_prob
+        ordered = raw_detections[raw_detections[:, 4].argsort()[::-1]][:, 0:6]
+        # transform width to right with clamp to 0..1
+        ordered[:, 2] = np.clip(ordered[:, 2] + ordered[:, 0], 0, 1)
+        # transform height to bottom with clamp to 0..1
+        ordered[:, 3] = np.clip(ordered[:, 3] + ordered[:, 1], 0, 1)
+        # put result into the correct order and limit to top 20
+        detections = ordered[:, [5, 4, 1, 0, 3, 2]][:20]
+        # pad to 20x6 shape
+        append_cnt = 20 - len(detections)
+        if append_cnt > 0:
+            detections = np.append(
+                detections, np.zeros((append_cnt, 6), np.float32), axis=0
+            )
+
+        self.fps.update()
+        return detections
diff --git a/frigate/http.py b/frigate/http.py
index 1627920d6..9b4999caa 100644
--- a/frigate/http.py
+++ b/frigate/http.py
@@ -358,9 +358,10 @@ def best(camera_name, label):
 
         crop = bool(request.args.get("crop", 0, type=int))
         if crop:
-            box = best_object.get("box", (0, 0, 300, 300))
+            box_size = 300
+            box = best_object.get("box", (0, 0, box_size, box_size))
             region = calculate_region(
-                best_frame.shape, box[0], box[1], box[2], box[3], 1.1
+                best_frame.shape, box[0], box[1], box[2], box[3], box_size, multiplier=1.1
             )
             best_frame = best_frame[region[1] : region[3], region[0] : region[2]]
 
diff --git a/frigate/object_processing.py b/frigate/object_processing.py
index bfbc0e414..86d1cfc01 100644
--- a/frigate/object_processing.py
+++ b/frigate/object_processing.py
@@ -262,7 +262,7 @@ class TrackedObject:
         if crop:
             box = self.thumbnail_data["box"]
             region = calculate_region(
-                best_frame.shape, box[0], box[1], box[2], box[3], 1.1
+                best_frame.shape, box[0], box[1], box[2], box[3], 300, multiplier=1.1
             )
             best_frame = best_frame[region[1] : region[3], region[0] : region[2]]
 
diff --git a/frigate/util.py b/frigate/util.py
index aa7abd004..8653cf829 100755
--- a/frigate/util.py
+++ b/frigate/util.py
@@ -189,12 +189,12 @@ def draw_box_with_label(
     )
 
 
-def calculate_region(frame_shape, xmin, ymin, xmax, ymax, multiplier=2):
+def calculate_region(frame_shape, xmin, ymin, xmax, ymax, model_size, multiplier=2):
     # size is the longest edge and divisible by 4
     size = int(max(xmax - xmin, ymax - ymin) // 4 * 4 * multiplier)
-    # dont go any smaller than 300
-    if size < 300:
-        size = 300
+    # don't go any smaller than the model_size
+    if size < model_size:
+        size = model_size
 
     # x_offset is midpoint of bounding box minus half the size
     x_offset = int((xmax - xmin) / 2.0 + xmin - size / 2.0)
diff --git a/frigate/video.py b/frigate/video.py
index c4d18cf5a..1a9f5a7be 100755
--- a/frigate/video.py
+++ b/frigate/video.py
@@ -74,14 +74,13 @@ def filtered(obj, objects_to_track, object_filters):
 def create_tensor_input(frame, model_shape, region):
     cropped_frame = yuv_region_2_rgb(frame, region)
 
-    # Resize to 300x300 if needed
+    # Resize to the model_shape if needed
     if cropped_frame.shape != (model_shape[0], model_shape[1], 3):
         cropped_frame = cv2.resize(
             cropped_frame, dsize=model_shape, interpolation=cv2.INTER_LINEAR
         )
-
-    # Expand dimensions since the model expects images to have shape: [1, height, width, 3]
-    return np.expand_dims(cropped_frame, axis=0)
+    # Return a tensor of shape: [height, width, 3] in RGB format
+    return cropped_frame
 
 
 def stop_ffmpeg(ffmpeg_process, logger):
@@ -497,9 +496,10 @@ def process_frames(
         # combine motion boxes with known locations of existing objects
         combined_boxes = reduce_boxes(motion_boxes + tracked_object_boxes)
 
+        region_min_size = max(model_shape[0], model_shape[1])
         # compute regions
         regions = [
-            calculate_region(frame_shape, a[0], a[1], a[2], a[3], 1.2)
+            calculate_region(frame_shape, a[0], a[1], a[2], a[3], region_min_size, multiplier=1.2)
             for a in combined_boxes
         ]
 
@@ -508,7 +508,7 @@ def process_frames(
 
         # re-compute regions
         regions = [
-            calculate_region(frame_shape, a[0], a[1], a[2], a[3], 1.0)
+            calculate_region(frame_shape, a[0], a[1], a[2], a[3], region_min_size, multiplier=1.0)
             for a in combined_regions
         ]
 
@@ -557,7 +557,7 @@ def process_frames(
                         box = obj[2]
                         # calculate a new region that will hopefully get the entire object
                         region = calculate_region(
-                            frame_shape, box[0], box[1], box[2], box[3]
+                            frame_shape, box[0], box[1], box[2], box[3], region_min_size
                         )
 
                         regions.append(region)