make TensorRT work (and break edgetpu)

YS 2021-12-28 14:53:47 +03:00
parent 2d585e8458
commit 0097ddb7cb
13 changed files with 430 additions and 123 deletions

View File

@@ -15,7 +15,10 @@ amd64_ffmpeg:
 	docker build --no-cache --pull --tag blakeblackshear/frigate-ffmpeg:1.2.0-amd64 --file docker/Dockerfile.ffmpeg.amd64 .
 nginx_frigate:
-	docker buildx build --push --platform linux/arm/v7,linux/arm64/v8,linux/amd64 --tag blakeblackshear/frigate-nginx:1.0.2 --file docker/Dockerfile.nginx .
+	docker build --tag blakeblackshear/frigate-nginx:1.0.2 --file docker/Dockerfile.nginx .
+nginx_frigate_l4t:
+	docker build --tag blakeblackshear/frigate-nginx-l4t:1.0.2 --file docker/Dockerfile.l4t.nginx .
 amd64_frigate: version web
 	docker build --no-cache --tag frigate-base --build-arg ARCH=amd64 --build-arg FFMPEG_VERSION=1.1.0 --build-arg WHEELS_VERSION=1.0.3 --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.base .
@@ -41,17 +44,35 @@ aarch64_wheels:
 aarch64_ffmpeg:
 	docker build --no-cache --pull --tag blakeblackshear/frigate-ffmpeg:1.3.0-aarch64 --file docker/Dockerfile.ffmpeg.aarch64 .
-aarch64_frigate: version web
-	docker build --no-cache --tag frigate-base --build-arg ARCH=aarch64 --build-arg FFMPEG_VERSION=1.0.0 --build-arg WHEELS_VERSION=1.0.3 --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.base .
+aarch64_frigate:
+	docker build --no-cache --tag frigate-base --build-arg ARCH=aarch64 --build-arg FFMPEG_VERSION=1.3.0 --build-arg WHEELS_VERSION=1.0.3 --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.base .
 	docker build --no-cache --tag frigate --file docker/Dockerfile.aarch64 .
+aarch64_dev:
+	docker build --tag frigate --file docker/Dockerfile.aarch64 .
 aarch64_all: aarch64_wheels aarch64_ffmpeg aarch64_frigate
+l4t_assets_yolo4:
+	mkdir -p $$(pwd)/.l4t_assets
+	cp ./converters/yolo4/plugin/* .l4t_assets/
+	cp ./converters/yolo4/model/yolov4-tiny-416.trt .l4t_assets/yolov4-tiny-416.trt
+	cp ./converters/yolo4/model/yolov4-tiny-288.trt .l4t_assets/yolov4-tiny-288.trt
+	# cp ./converters/yolo4/model/yolov4-416.trt .l4t_assets/yolov4-416.trt
+	# cp ./converters/yolo4/model/yolov4-288.trt .l4t_assets/yolov4-288.trt
+l4t_dev: # l4t_assets_yolo4
+	nvidia-docker build --tag frigate.l4t --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.l4t.base .
+l4t_dev_test:
+	nvidia-docker build --tag frigate.l4t.onnx --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.l4t.onnx ./onnx_test/
 armv7_wheels:
 	docker build --tag blakeblackshear/frigate-wheels:1.0.3-armv7 --file docker/Dockerfile.wheels .
 armv7_ffmpeg:
-	docker build --no-cache --pull --tag blakeblackshear/frigate-ffmpeg:1.2.0-armv7 --file docker/Dockerfile.ffmpeg.armv7 .
+	docker build --pull --tag blakeblackshear/frigate-ffmpeg:1.2.0-armv7 --file docker/Dockerfile.ffmpeg.armv7 .
 armv7_frigate: version web
 	docker build --no-cache --tag frigate-base --build-arg ARCH=armv7 --build-arg FFMPEG_VERSION=1.0.0 --build-arg WHEELS_VERSION=1.0.3 --build-arg NGINX_VERSION=1.0.2 --file docker/Dockerfile.base .

View File

@@ -2,6 +2,7 @@
 set -xe
 cd /tensorrt_demos/plugins && make
+cp /tensorrt_demos/plugins/libyolo_layer.so /plugin/libyolo_layer.so
 cd /tensorrt_demos/yolo
 for model in yolov4-tiny-288 \

View File

@@ -1,12 +1,14 @@
 #!/bin/bash
 mkdir -p $(pwd)/model
+mkdir -p $(pwd)/plugin
 docker build --tag models.yolo4 --file ./Dockerfile.l4t.tf15 ./assets/
 sudo docker run --rm -it --name models.yolo4 \
 	--mount type=tmpfs,target=/tmp/cache,tmpfs-size=1000000000 \
 	-v $(pwd)/model:/model:rw \
+	-v $(pwd)/plugin:/plugin:rw \
 	-v /tmp/argus_socket:/tmp/argus_socket \
 	-e NVIDIA_VISIBLE_DEVICES=all \
 	-e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video \

View File

@@ -1,21 +1,24 @@
 ARG NGINX_VERSION
 FROM blakeblackshear/frigate-nginx-l4t:${NGINX_VERSION} as nginx
 FROM frigate-web as web
-FROM nvcr.io/nvidia/l4t-tensorflow:r32.6.1-tf2.5-py3 as wheels
+FROM nvcr.io/nvidia/l4t-tensorrt:r8.0.1-runtime as wheels
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get -qq update \
     && apt-get -qq install -y \
+    python3.8 \
     python3.8-dev \
     wget \
     # opencv dependencies
-    build-essential cmake git pkg-config libgtk-3-dev \
+    build-essential cmake git pkg-config libgtk-3-dev
+RUN apt-get -qq install -y \
     libavcodec-dev libavformat-dev libswscale-dev libv4l-dev \
-    libxvidcore-dev libx264-dev libjpeg-dev libpng-dev libtiff-dev \
+    libxvidcore-dev libx264-dev libjpeg-dev libpng-dev libtiff-dev
+RUN apt-get -qq install -y \
     gfortran openexr libatlas-base-dev libssl-dev\
-    libtbb2 libtbb-dev libdc1394-22-dev libopenexr-dev \
+    libtbb2 libtbb-dev libdc1394-22-dev libopenexr-dev
+RUN apt-get -qq install -y \
     libgstreamer-plugins-base1.0-dev libgstreamer1.0-dev \
     # scipy dependencies
     gcc gfortran libopenblas-dev liblapack-dev cython
@@ -39,7 +42,7 @@ RUN pip3 wheel --wheel-dir=/wheels \
     setproctitle \
     peewee
-FROM nvcr.io/nvidia/l4t-tensorflow:r32.6.1-tf2.5-py3
+FROM nvcr.io/nvidia/l4t-tensorrt:r8.0.1-runtime
 ENV DEBIAN_FRONTEND=noninteractive
 RUN \
     apt-get update && apt-get install -y gnupg
@@ -78,20 +81,11 @@ RUN pip3 install \
     peewee_migrate \
     pydantic \
     zeroconf \
-    ws4py \
-    # Python 3.6
-    shared-memory38
+    ws4py
-# setup gstreamer
-RUN \
-    apt-get update && apt-get install -y software-properties-common && \
-    add-apt-repository universe && \
-    add-apt-repository multiverse && \
-    apt-get update
 RUN \
-    apt-get install -y gstreamer1.0-tools gstreamer1.0-alsa \
+    apt-get update && apt-get install -y gstreamer1.0-tools gstreamer1.0-alsa \
     gstreamer1.0-plugins-base gstreamer1.0-plugins-good \
     gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly
@@ -110,24 +104,35 @@ RUN wget -q https://github.com/google-coral/test_data/raw/release-frogfish/ssdli
 COPY --from=nginx /usr/local/nginx/ /usr/local/nginx/
 COPY --from=web /opt/frigate/build /opt/frigate/web/
+# install TRT dependencies
+RUN apt-get update && apt-get install -y git sudo
+ADD docker/l4t/ /l4t/
+RUN /l4t/install_pycuda.sh
 # s6-overlay
 COPY docker/rootfs/ /
 ADD https://github.com/just-containers/s6-overlay/releases/download/v2.2.0.3/s6-overlay-aarch64-installer /tmp/
 RUN chmod +x /tmp/s6-overlay-aarch64-installer && /tmp/s6-overlay-aarch64-installer /
-WORKDIR /opt/frigate/
-ADD frigate frigate/
-ADD migrations migrations/
 COPY labelmap.txt /labelmap.txt
+COPY detect.tflite /detect.tflite
 # edgetpu experiments
 RUN wget -q https://github.com/Azure/Azure-AI-Camp/releases/download/v1.0/yolov4-tiny.tflite -O /yolov4-tiny.tflite
+# TRT Yolo4 Plugin
+ADD .l4t_assets /yolo4/
 EXPOSE 5000
 EXPOSE 1935
+WORKDIR /opt/frigate/
+ADD frigate frigate/
+ADD migrations migrations/
 ENTRYPOINT ["/init"]
 CMD ["python3", "-u", "-m", "frigate"]
-COPY frigate frigate/
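With the base image switched to l4t-tensorrt and install_pycuda.sh run at build time, the runtime container has to expose tensorrt and pycuda to Python 3. A hedged smoke test one might run inside the built image to confirm that (these are the libraries' standard import names; nothing in this snippet is defined by the commit itself):

import ctypes

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context on import

print("TensorRT:", trt.__version__)
print("CUDA device:", pycuda.autoinit.device.name())

# the plugin shipped to /yolo4 must load before deserializing the engine
ctypes.cdll.LoadLibrary("/yolo4/libyolo_layer.so")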

View File

@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/l4t-tensorflow:r32.6.1-tf2.5-py3 AS base
+FROM nvcr.io/nvidia/l4t-tensorrt:r8.0.1-runtime AS base
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get -yqq update && \

View File

@@ -36,6 +36,7 @@ class FrigateBaseModel(BaseModel):
 class DetectorTypeEnum(str, Enum):
     edgetpu = "edgetpu"
     cpu = "cpu"
+    tensorrt = "tensorrt"

 class DetectorConfig(FrigateBaseModel):
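The new enum member makes "tensorrt" a valid detector type in the config. A minimal sketch of how the value resolves, constructing DetectorConfig directly (this assumes the model's other fields have defaults; the YAML wiring is outside this diff):

from frigate.config import DetectorConfig, DetectorTypeEnum

# DetectorTypeEnum subclasses str, so the config value "tensorrt"
# parses directly into the new member.
cfg = DetectorConfig(type="tensorrt")
assert cfg.type == DetectorTypeEnum.tensorrt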

View File

@@ -1,12 +1,113 @@
+import datetime
 import logging
+import multiprocessing as mp
+import os
+import queue
+import signal
+import threading
-import os
 import numpy as np
-import multiprocessing as mp
-from frigate.util import EventsPerSecond
+from frigate.util import EventsPerSecond, SharedMemoryFrameManager, listen
 from frigate.config import DetectorConfig, DetectorTypeEnum
+from frigate.detection.object_detector import ObjectDetector
+import importlib
+from setproctitle import setproctitle
+from typing import Dict, Callable

 logger = logging.getLogger(__name__)
+DETECTORS = {
+    DetectorTypeEnum.cpu: "edgetpu",
+    DetectorTypeEnum.edgetpu: "edgetpu",
+    DetectorTypeEnum.tensorrt: "tensorrt",
+}
+
+
+def get_object_detector_factory(
+    detector_config: DetectorConfig, model_path: str
+) -> Callable[[], ObjectDetector]:
+    """
+    Return an object detector factory.
+
+    Since resource initialization might be performed on python import,
+    delay the module load until the detector thread has started.
+    """
+    detector_module = DETECTORS.get(detector_config.type)
+    if detector_module is None:
+        logger.error(f"Unsupported detector type '{detector_config.type}'.")
+        return None
+
+    def _detector_factory() -> ObjectDetector:
+        path = os.path.join(os.path.dirname(__file__), f"{detector_module}.py")
+        spec = importlib.util.spec_from_file_location(
+            f"frigate.detection.{detector_module}", path
+        )
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        object_detector = module.object_detector_factory(detector_config, model_path)
+        return object_detector
+
+    return _detector_factory
+
+
+def run_detector(
+    name: str,
+    detection_queue: mp.Queue,
+    out_events: Dict[str, mp.Event],
+    avg_speed,
+    start,
+    model_shape,
+    object_detector_factory: Callable[[], ObjectDetector],
+):
+    threading.current_thread().name = f"detector:{name}"
+    logger = logging.getLogger(f"detector.{name}")
+    logger.info(f"Starting detection process: {os.getpid()}")
+    setproctitle(f"frigate.detector.{name}")
+    listen()
+
+    stop_event = mp.Event()
+
+    def receiveSignal(signalNumber, frame):
+        stop_event.set()
+
+    signal.signal(signal.SIGTERM, receiveSignal)
+    signal.signal(signal.SIGINT, receiveSignal)
+
+    frame_manager = SharedMemoryFrameManager()
+
+    outputs = {}
+    for name in out_events.keys():
+        out_shm = mp.shared_memory.SharedMemory(name=f"out-{name}", create=False)
+        out_np = np.ndarray((20, 6), dtype=np.float32, buffer=out_shm.buf)
+        outputs[name] = {"shm": out_shm, "np": out_np}
+
+    object_detector = object_detector_factory()
+    while not stop_event.is_set():
+        try:
+            connection_id = detection_queue.get(timeout=5)
+        except queue.Empty:
+            continue
+        input_frame = frame_manager.get(
+            connection_id, (model_shape[0], model_shape[1], 3)
+        )
+
+        if input_frame is None:
+            continue
+
+        # detect and send the output
+        start.value = datetime.datetime.now().timestamp()
+        detections = object_detector.detect_raw(input_frame)
+        duration = datetime.datetime.now().timestamp() - start.value
+        outputs[connection_id]["np"][:] = detections[:]
+        out_events[connection_id].set()
+        start.value = 0.0
+
+        avg_speed.value = (avg_speed.value * 9 + duration) / 10
+    del object_detector
+
+
 class DetectionProcess:
     def __init__(
         self,
@@ -27,18 +128,11 @@ class DetectionProcess:
         self.model_shape = model_shape
         self.detector_config = detector_config
-        self.detector_target = None
-        if (
-            detector_config.type == DetectorTypeEnum.cpu
-            or detector_config.type == DetectorTypeEnum.edgetpu
-        ):
-            from .edgetpu import run_detector as edgetpu_detector
-
-            self.detector_target = edgetpu_detector
-
-        assert self.detector_target, "Invalid detector configuration"
-        self.start_or_restart()
+        self.object_detector_factory = get_object_detector_factory(
+            detector_config, model_path
+        )
+        if self.object_detector_factory:
+            self.start_or_restart()

     def stop(self):
         self.detect_process.terminate()
@@ -54,7 +148,7 @@
         if (not self.detect_process is None) and self.detect_process.is_alive():
             self.stop()
         self.detect_process = mp.Process(
-            target=self.detector_target,
+            target=run_detector,
             name=f"detector:{self.name}",
             args=(
                 self.name,
@@ -62,9 +156,8 @@
                 self.out_events,
                 self.avg_inference_speed,
                 self.detection_start,
-                self.model_path,
                 self.model_shape,
-                self.detector_config,
+                self.object_detector_factory,
             ),
         )
         self.detect_process.daemon = True
@@ -103,9 +196,11 @@ class RemoteObjectDetector:
         for d in self.out_np_shm:
             if d[1] < threshold:
                 break
-            detections.append(
-                (self.labels[int(d[0])], float(d[1]), (d[2], d[3], d[4], d[5]))
-            )
+            label_key = int(d[0])
+            if label_key in self.labels:
+                detections.append(
+                    (self.labels[label_key], float(d[1]), (d[2], d[3], d[4], d[5]))
+                )
         self.fps.update()
         return detections
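get_object_detector_factory returns a closure so that the backend module is imported inside the spawned detector process rather than in the parent; pycuda.autoinit at the top of tensorrt.py would otherwise grab a CUDA context in the wrong process. A standalone sketch of the same delayed-import pattern (module name and path are illustrative only):

import importlib.util
import os


def lazy_module_factory(module_name: str, path: str):
    """Return a zero-argument callable that imports `path` only when invoked."""

    def _load():
        spec = importlib.util.spec_from_file_location(module_name, path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)  # import side effects happen here
        return module

    return _load


# Built in the parent process, executed in the worker: heavyweight imports
# (tensorrt, pycuda.autoinit) then run in the process that owns the GPU.
factory = lazy_module_factory("plugins.demo", os.path.join("plugins", "demo.py"))
# module = factory()  # call inside the spawned detector process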

View File

@@ -1,26 +1,39 @@
-import datetime
 import logging
 import multiprocessing as mp
 import os
 import queue
 import signal
 import threading
-from frigate.config import DetectorConfig
+from frigate.config import DetectorConfig, DetectorTypeEnum
 from typing import Dict

 import numpy as np

 # import tflite_runtime.interpreter as tflite
-from setproctitle import setproctitle
 # from tflite_runtime.interpreter import load_delegate
-from frigate.util import EventsPerSecond, SharedMemoryFrameManager, listen
+from frigate.util import EventsPerSecond
 from .object_detector import ObjectDetector

 logger = logging.getLogger(__name__)

+
+def object_detector_factory(detector_config: DetectorConfig, model_path: str):
+    if not (
+        detector_config.type == DetectorTypeEnum.cpu
+        or detector_config.type == DetectorTypeEnum.edgetpu
+    ):
+        return None
+
+    object_detector = LocalObjectDetector(
+        tf_device=detector_config.device,
+        model_path=model_path,
+        num_threads=detector_config.num_threads,
+    )
+    return object_detector
+
+
 class LocalObjectDetector(ObjectDetector):
     def __init__(self, tf_device=None, model_path=None, num_threads=3):
         self.fps = EventsPerSecond()
@@ -80,6 +93,11 @@ class LocalObjectDetector(ObjectDetector):
         return detections

     def detect_raw(self, tensor_input):
+        logger.error(">>>>>>>>>> detect raw")
+        # Expand dimensions [height, width, 3] since the model expects
+        # images to have shape [1, height, width, 3]
+        tensor_input = np.expand_dims(tensor_input, axis=0)
+
         # self.interpreter.set_tensor(self.tensor_input_details[0]["index"], tensor_input)
         # self.interpreter.invoke()
@@ -105,63 +123,3 @@ class LocalObjectDetector(ObjectDetector):
         # ]
         return detections
-
-
-def run_detector(
-    name: str,
-    detection_queue: mp.Queue,
-    out_events: Dict[str, mp.Event],
-    avg_speed,
-    start,
-    model_path,
-    model_shape,
-    detector_config: DetectorConfig,
-):
-    threading.current_thread().name = f"detector:{name}"
-    logger = logging.getLogger(f"detector.{name}")
-    logger.info(f"Starting detection process: {os.getpid()}")
-    setproctitle(f"frigate.detector.{name}")
-    listen()
-
-    stop_event = mp.Event()
-
-    def receiveSignal(signalNumber, frame):
-        stop_event.set()
-
-    signal.signal(signal.SIGTERM, receiveSignal)
-    signal.signal(signal.SIGINT, receiveSignal)
-
-    frame_manager = SharedMemoryFrameManager()
-    object_detector = LocalObjectDetector(
-        tf_device=detector_config.device,
-        model_path=model_path,
-        num_threads=detector_config.num_threads,
-    )
-
-    outputs = {}
-    for name in out_events.keys():
-        out_shm = mp.shared_memory.SharedMemory(name=f"out-{name}", create=False)
-        out_np = np.ndarray((20, 6), dtype=np.float32, buffer=out_shm.buf)
-        outputs[name] = {"shm": out_shm, "np": out_np}
-
-    while not stop_event.is_set():
-        try:
-            connection_id = detection_queue.get(timeout=5)
-        except queue.Empty:
-            continue
-        input_frame = frame_manager.get(
-            connection_id, (1, model_shape[0], model_shape[1], 3)
-        )
-
-        if input_frame is None:
-            continue
-
-        # detect and send the output
-        start.value = datetime.datetime.now().timestamp()
-        detections = object_detector.detect_raw(input_frame)
-        duration = datetime.datetime.now().timestamp() - start.value
-        outputs[connection_id]["np"][:] = detections[:]
-        out_events[connection_id].set()
-        start.value = 0.0
-
-        avg_speed.value = (avg_speed.value * 9 + duration) / 10
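Each backend module now has to expose a module-level object_detector_factory(detector_config, model_path) that returns an ObjectDetector, or None when the config does not target it. A hedged skeleton of what a third backend would look like under that contract (the class name and gating condition are illustrative, not part of the commit):

import numpy as np

from frigate.config import DetectorConfig, DetectorTypeEnum
from .object_detector import ObjectDetector


def object_detector_factory(detector_config: DetectorConfig, model_path: str):
    # Returning None tells the loader this backend doesn't handle the config.
    if detector_config.type != DetectorTypeEnum.cpu:  # illustrative gate
        return None
    return NullObjectDetector()


class NullObjectDetector(ObjectDetector):
    """Stub backend: always reports an empty 20x6 detection block."""

    def detect(self, tensor_input, threshold=0.4):
        return []

    def detect_raw(self, tensor_input):
        # Same contract as the real backends: rows of
        # [class_id, score, top, left, bottom, right]
        return np.zeros((20, 6), np.float32)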

View File

@@ -0,0 +1,223 @@
import logging

from frigate.config import DetectorConfig, DetectorTypeEnum
from frigate.util import EventsPerSecond

import ctypes

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda

from .object_detector import ObjectDetector

import pycuda.autoinit  # This is needed for initializing CUDA driver

logger = logging.getLogger(__name__)


def object_detector_factory(detector_config: DetectorConfig, model_path: str):
    if detector_config.type != DetectorTypeEnum.tensorrt:
        return None
    try:
        ctypes.cdll.LoadLibrary("/yolo4/libyolo_layer.so")
    except OSError as e:
        logger.error("ERROR: failed to load /yolo4/libyolo_layer.so. %s", e)
    return LocalObjectDetector(detector_config, model_path)


class HostDeviceMem(object):
    """Simple helper data class that's a little nicer to use than a 2-tuple."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


class LocalObjectDetector(ObjectDetector):
    def _load_engine(self, model_path):
        with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def _get_input_shape(self):
        """Get input shape of the TensorRT YOLO engine."""
        binding = self.engine[0]
        assert self.engine.binding_is_input(binding)
        binding_dims = self.engine.get_binding_shape(binding)
        if len(binding_dims) == 4:
            return tuple(binding_dims[2:])
        elif len(binding_dims) == 3:
            return tuple(binding_dims[1:])
        else:
            raise ValueError(
                "bad dims of binding %s: %s" % (binding, str(binding_dims))
            )

    def _allocate_buffers(self):
        """Allocates all host/device in/out buffers required for an engine."""
        inputs = []
        outputs = []
        bindings = []
        output_idx = 0
        stream = cuda.Stream()
        for binding in self.engine:
            binding_dims = self.engine.get_binding_shape(binding)
            if len(binding_dims) == 4:
                # explicit batch case (TensorRT 7+)
                size = trt.volume(binding_dims)
            elif len(binding_dims) == 3:
                # implicit batch case (TensorRT 6 or older)
                size = trt.volume(binding_dims) * self.engine.max_batch_size
            else:
                raise ValueError(
                    "bad dims of binding %s: %s" % (binding, str(binding_dims))
                )
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                # each grid has 3 anchors, each anchor generates a detection
                # output of 7 float32 values
                assert size % 7 == 0
                outputs.append(HostDeviceMem(host_mem, device_mem))
                output_idx += 1
        assert len(inputs) == 1
        assert len(outputs) == 1
        return inputs, outputs, bindings, stream

    def _do_inference(self):
        """do_inference (for TensorRT 7.0+)

        This function is generalized for multiple inputs/outputs for full
        dimension networks.
        Inputs and outputs are expected to be lists of HostDeviceMem objects.
        """
        # Transfer input data to the GPU.
        [
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
            for inp in self.inputs
        ]
        # Run inference.
        self.context.execute_async_v2(
            bindings=self.bindings, stream_handle=self.stream.handle
        )
        # Transfer predictions back from the GPU.
        [
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
            for out in self.outputs
        ]
        # Synchronize the stream
        self.stream.synchronize()
        # Return only the host outputs.
        return [out.host for out in self.outputs]

    def __init__(self, detector_config: DetectorConfig, model_path: str):
        self.fps = EventsPerSecond()
        self.conf_th = 0.4  ##TODO: model config parameter
        self.nms_threshold = 0.4
        self.trt_logger = trt.Logger(trt.Logger.INFO)
        self.engine = self._load_engine(model_path)
        self.input_shape = self._get_input_shape()
        try:
            self.context = self.engine.create_execution_context()
            (
                self.inputs,
                self.outputs,
                self.bindings,
                self.stream,
            ) = self._allocate_buffers()
        except Exception as e:
            logger.error(e)
            raise RuntimeError("failed to allocate CUDA resources") from e
        logger.debug("TensorRT loaded. Input shape is %s", self.input_shape)
        logger.debug("TensorRT version is %s", trt.__version__[0])

    def __del__(self):
        """Free CUDA memories."""
        del self.outputs
        del self.inputs
        del self.stream

    def _postprocess_yolo(self, trt_outputs, img_w, img_h, conf_th, nms_threshold):
        """Postprocess TensorRT outputs.

        # Args
            trt_outputs: a list of 2 or 3 tensors, where each tensor
                contains a multiple of 7 float32 numbers in the order of
                [x, y, w, h, box_confidence, class_id, class_prob]
            conf_th: confidence threshold

        # Returns
            boxes, scores, classes
        """
        # filter low-conf detections and concatenate results of all yolo layers
        detections = []
        for o in trt_outputs:
            dets = o.reshape((-1, 7))
            dets = dets[dets[:, 4] * dets[:, 6] >= conf_th]
            detections.append(dets)
        detections = np.concatenate(detections, axis=0)

        return detections

    def detect(self, tensor_input, threshold=0.4):
        pass

    def detect_raw(self, tensor_input):
        # Input tensor has the shape of [height, width, 3]
        # Output tensor is float32 of shape [20, 6] where:
        # 0 - class id
        # 1 - score
        # 2..5 - a value between 0 and 1 of the box: [top, left, bottom, right]

        # transform [height, width, 3] into (3, H, W)
        tensor_input = tensor_input.transpose((2, 0, 1)).astype(np.float32)
        # normalize
        tensor_input /= 255.0

        self.inputs[0].host = np.ascontiguousarray(tensor_input)
        trt_outputs = self._do_inference()
        raw_detections = self._postprocess_yolo(
            trt_outputs,
            tensor_input.shape[1],
            tensor_input.shape[0],
            self.conf_th,
            nms_threshold=self.nms_threshold,
        )

        if len(raw_detections) == 0:
            return np.zeros((20, 6), np.float32)

        # raw_detections: Nx7 numpy arrays of
        #     [x, y, w, h, box_confidence, class_id, class_prob]
        # Calculate score as box_confidence x class_prob
        raw_detections[:, 4] = raw_detections[:, 4] * raw_detections[:, 6]
        # Reorder elements by the score, best on top, remove class_prob
        ordered = raw_detections[raw_detections[:, 4].argsort()[::-1]][:, 0:6]
        # transform width to right with clamp to 0..1
        ordered[:, 2] = np.clip(ordered[:, 2] + ordered[:, 0], 0, 1)
        # transform height to bottom with clamp to 0..1
        ordered[:, 3] = np.clip(ordered[:, 3] + ordered[:, 1], 0, 1)
        # put result into the correct order and limit to top 20
        detections = ordered[:, [5, 4, 1, 0, 3, 2]][:20]
        # pad to 20x6 shape
        append_cnt = 20 - len(detections)
        if append_cnt > 0:
            detections = np.append(
                detections, np.zeros((append_cnt, 6), np.float32), axis=0
            )

        self.fps.update()
        return detections
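The box algebra in detect_raw is easy to verify in isolation. A small sketch with one synthetic output row (no TensorRT needed) tracing [x, y, w, h, box_confidence, class_id, class_prob] to the [class, score, top, left, bottom, right] layout the rest of Frigate expects:

import numpy as np

# One fake YOLO output row: box at x=0.1, y=0.2, w=0.3, h=0.4,
# box_confidence=0.9, class_id=2.0, class_prob=0.8
raw = np.array([[0.1, 0.2, 0.3, 0.4, 0.9, 2.0, 0.8]], dtype=np.float32)

raw[:, 4] *= raw[:, 6]                       # score = 0.9 * 0.8 = 0.72
ordered = raw[raw[:, 4].argsort()[::-1]][:, 0:6]
ordered[:, 2] = np.clip(ordered[:, 2] + ordered[:, 0], 0, 1)  # right = x + w = 0.4
ordered[:, 3] = np.clip(ordered[:, 3] + ordered[:, 1], 0, 1)  # bottom = y + h = 0.6
det = ordered[:, [5, 4, 1, 0, 3, 2]]         # [class, score, top, left, bottom, right]

print(det)  # [[2.0, 0.72, 0.2, 0.1, 0.6, 0.4]]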

View File

@@ -358,9 +358,10 @@ def best(camera_name, label):
         crop = bool(request.args.get("crop", 0, type=int))
         if crop:
-            box = best_object.get("box", (0, 0, 300, 300))
+            box_size = 300
+            box = best_object.get("box", (0, 0, box_size, box_size))
             region = calculate_region(
-                best_frame.shape, box[0], box[1], box[2], box[3], 1.1
+                best_frame.shape, box[0], box[1], box[2], box[3], box_size, multiplier=1.1
             )
             best_frame = best_frame[region[1] : region[3], region[0] : region[2]]

View File

@@ -262,7 +262,7 @@ class TrackedObject:
         if crop:
             box = self.thumbnail_data["box"]
             region = calculate_region(
-                best_frame.shape, box[0], box[1], box[2], box[3], 1.1
+                best_frame.shape, box[0], box[1], box[2], box[3], 300, multiplier=1.1
             )
             best_frame = best_frame[region[1] : region[3], region[0] : region[2]]

View File

@@ -189,12 +189,12 @@ def draw_box_with_label(
     )

-def calculate_region(frame_shape, xmin, ymin, xmax, ymax, multiplier=2):
+def calculate_region(frame_shape, xmin, ymin, xmax, ymax, model_size, multiplier=2):
     # size is the longest edge and divisible by 4
     size = int(max(xmax - xmin, ymax - ymin) // 4 * 4 * multiplier)
-    # dont go any smaller than 300
-    if size < 300:
-        size = 300
+    # dont go any smaller than the model_size
+    if size < model_size:
+        size = model_size

     # x_offset is midpoint of bounding box minus half the size
     x_offset = int((xmax - xmin) / 2.0 + xmin - size / 2.0)
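The effect of replacing the hardcoded 300 with model_size is easiest to see with numbers. A sketch of just the sizing arithmetic (the offset/clamping tail of calculate_region is outside this hunk and omitted):

def region_size(xmin, ymin, xmax, ymax, model_size, multiplier=2):
    # longest box edge, snapped down to a multiple of 4, then scaled
    size = int(max(xmax - xmin, ymax - ymin) // 4 * 4 * multiplier)
    # never go below the detector's input resolution
    return max(size, model_size)

print(region_size(100, 100, 150, 130, model_size=300))  # 300: small box, floor wins
print(region_size(100, 100, 150, 130, model_size=416))  # 416: yolov4-tiny-416 floor
print(region_size(0, 0, 400, 250, model_size=300, multiplier=1.2))  # 480: box drives size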

View File

@@ -74,14 +74,13 @@ def filtered(obj, objects_to_track, object_filters):
 def create_tensor_input(frame, model_shape, region):
     cropped_frame = yuv_region_2_rgb(frame, region)

-    # Resize to 300x300 if needed
+    # Resize to the model_shape if needed
     if cropped_frame.shape != (model_shape[0], model_shape[1], 3):
         cropped_frame = cv2.resize(
             cropped_frame, dsize=model_shape, interpolation=cv2.INTER_LINEAR
         )

-    # Expand dimensions since the model expects images to have shape: [1, height, width, 3]
-    return np.expand_dims(cropped_frame, axis=0)
+    # Return a tensor of shape: [height, width, 3] in RGB format
+    return cropped_frame

 def stop_ffmpeg(ffmpeg_process, logger):
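create_tensor_input now hands detectors a plain [height, width, 3] array; each backend applies its own layout (the edgetpu path re-adds the batch dimension in detect_raw, the TensorRT path transposes to channels-first and normalizes). A small sketch of the two shape contracts:

import numpy as np

frame = np.zeros((416, 416, 3), dtype=np.uint8)  # what create_tensor_input now returns

# tflite/edgetpu path: the batch dimension is re-added inside detect_raw
batched = np.expand_dims(frame, axis=0)          # (1, 416, 416, 3)

# TensorRT path: channels-first float input, normalized to 0..1
chw = frame.transpose((2, 0, 1)).astype(np.float32) / 255.0  # (3, 416, 416)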
@@ -497,9 +496,10 @@ def process_frames(
         # combine motion boxes with known locations of existing objects
         combined_boxes = reduce_boxes(motion_boxes + tracked_object_boxes)

+        region_min_size = max(model_shape[0], model_shape[1])
         # compute regions
         regions = [
-            calculate_region(frame_shape, a[0], a[1], a[2], a[3], 1.2)
+            calculate_region(frame_shape, a[0], a[1], a[2], a[3], region_min_size, multiplier=1.2)
             for a in combined_boxes
         ]
@@ -508,7 +508,7 @@
             # re-compute regions
             regions = [
-                calculate_region(frame_shape, a[0], a[1], a[2], a[3], 1.0)
+                calculate_region(frame_shape, a[0], a[1], a[2], a[3], region_min_size, multiplier=1.0)
                 for a in combined_regions
             ]
@@ -557,7 +557,7 @@
                 box = obj[2]
                 # calculate a new region that will hopefully get the entire object
                 region = calculate_region(
-                    frame_shape, box[0], box[1], box[2], box[3]
+                    frame_shape, box[0], box[1], box[2], box[3], region_min_size
                 )
                 regions.append(region)