Mirror of https://github.com/blakeblackshear/frigate.git (synced 2026-02-03 17:55:21 +03:00)

Updates to detector for cuda python library

This commit is contained in:
parent 4b562add75
commit ccc1218cd5
@@ -2,10 +2,10 @@
 
 set -euxo pipefail
 
-echo "${CUDA_LIB_VERSION:=11.8}"
-echo "${CUDA_PKG_VERSION:=11-8}"
+echo "${CUDA_LIB_VERSION:=11.7}"
+echo "${CUDA_PKG_VERSION:=11-7}"
 echo "${CUDNN_VERSION:=8.6.0.84}"
-echo "${TENSORRT_VERSION:=8.5.1}"
+echo "${TENSORRT_VERSION:=8.4.1}"
 
 # Add NVidia Repo
 apt-get -qq update && apt-get install -y --no-install-recommends software-properties-common
@@ -5,7 +5,7 @@ import logging
 import ctypes
 import numpy as np
 import tensorrt as trt
-import cuda as cuda
+from cuda import cuda as cuda
 
 # import pycuda.driver as cuda
 # from .object_detector import ObjectDetector
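The new import pulls in NVIDIA's cuda-python driver bindings, whose calling convention differs from pycuda: every cuda.cu* call returns a tuple whose first element is a CUresult error code. A minimal sketch of that convention, assuming only that cuda-python is installed and a GPU is present:

from cuda import cuda

(err,) = cuda.cuInit(0)  # initialize the driver API; returns a 1-tuple
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuInit returned {err}"

err, device = cuda.cuDeviceGet(0)  # handle for the first GPU
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuDeviceGet returned {err}"

The hunks below follow this unpack-and-check pattern everywhere the old pycuda-style calls used to be.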
@@ -28,9 +28,10 @@ logger = logging.getLogger(__name__)
 class HostDeviceMem(object):
     """Simple helper data class that's a little nicer to use than a 2-tuple."""
 
-    def __init__(self, host_mem, device_mem):
+    def __init__(self, host_mem, device_mem, nbytes):
         self.host = host_mem
         self.device = device_mem
+        self.nbytes = nbytes
 
     def __str__(self):
         return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
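With nbytes stored on the helper, the async copies later in the file can ask each buffer pair for its own size instead of recomputing it. A tiny usage sketch, assuming the class as defined above (the pointer values are illustrative placeholders, not real allocations):

mem = HostDeviceMem(host_mem=0x7F00AA000000, device_mem=0xD000000, nbytes=1228800)
print(mem.nbytes)  # byte count travels with the host/device pointer pair
print(mem)         # __str__ shows both pointers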
@@ -38,10 +39,29 @@ class HostDeviceMem(object):
     def __repr__(self):
         return self.__str__()
 
+    def __del__(self):
+        cuda.cuMemFreeHost(self.host)
+        cuda.cuMemFree(self.device)
+
 
 class TensorRtDetector(DetectionApi):
     # class LocalObjectDetector(ObjectDetector):
     def _load_engine(self, model_path):
+        try:
+            ctypes.cdll.LoadLibrary(
+                "/usr/local/lib/python3.9/dist-packages/nvidia/cuda_runtime/lib/libcudart.so.11.0"
+            )
+            ctypes.cdll.LoadLibrary(
+                "/usr/local/lib/python3.9/dist-packages/tensorrt/libnvinfer.so.8"
+            )
+            ctypes.cdll.LoadLibrary(
+                "/media/frigate/models/tensorrt_demos/yolo/libyolo_layer.so"
+            )
+        except OSError as e:
+            logger.error(
+                "ERROR: failed to load libraries. %s",
+                e,
+            )
         with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
             return runtime.deserialize_cuda_engine(f.read())
 
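The new __del__ mirrors the allocation pattern used later in the buffer-setup hunk: cuMemAllocHost for page-locked host memory and cuMemAlloc for device memory, each released by its matching free call. A minimal sketch of that pairing, assuming the driver API is initialized and a current context exists (the 4096-byte size is arbitrary):

from cuda import cuda

nbytes = 4096  # arbitrary size for illustration
err, host_ptr = cuda.cuMemAllocHost(nbytes)  # page-locked host buffer
err, dev_ptr = cuda.cuMemAlloc(nbytes)       # device buffer
# ... use the pair for async transfers ...
cuda.cuMemFreeHost(host_ptr)  # what __del__ calls for self.host
cuda.cuMemFree(dev_ptr)       # what __del__ calls for self.device

Note that tying the frees to __del__ leaves release timing to the garbage collector; the cleanup hunk below still drops the buffer lists explicitly.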
@@ -65,7 +85,7 @@ class TensorRtDetector(DetectionApi):
         outputs = []
         bindings = []
         output_idx = 0
-        stream = cuda.cuStream()
+        err, stream = cuda.cuStreamCreate(0)
         for binding in self.engine:
             binding_dims = self.engine.get_binding_shape(binding)
             if len(binding_dims) == 4:
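cuStreamCreate follows the same error-tuple convention, and the returned CUstream wrapper exposes its raw address through getPtr(), which the inference hunk below hands to TensorRT. A short sketch under the same initialized-context assumption:

from cuda import cuda

err, stream = cuda.cuStreamCreate(0)  # 0 = default stream creation flags
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuStreamCreate returned {err}"
handle = stream.getPtr()  # plain integer handle for stream_handle=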
@@ -78,23 +98,28 @@ class TensorRtDetector(DetectionApi):
                 raise ValueError(
                     "bad dims of binding %s: %s" % (binding, str(binding_dims))
                 )
-            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
+            nbytes = (
+                size
+                * np.dtype(trt.nptype(self.engine.get_binding_dtype(binding))).itemsize
+            )
             # Allocate host and device buffers
-            host_mem = cuda.pagelocked_empty(size, dtype)
-            device_mem = cuda.mem_alloc(host_mem.nbytes)
+            err, host_mem = cuda.cuMemAllocHost(nbytes)
+            assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
+            err, device_mem = cuda.cuMemAlloc(nbytes)
+            assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
             # Append the device buffer to device bindings.
             bindings.append(int(device_mem))
             # Append to the appropriate list.
             if self.engine.binding_is_input(binding):
-                inputs.append(HostDeviceMem(host_mem, device_mem))
+                inputs.append(HostDeviceMem(host_mem, device_mem, nbytes))
             else:
                 # each grid has 3 anchors, each anchor generates a detection
                 # output of 7 float32 values
-                assert size % 7 == 0
-                outputs.append(HostDeviceMem(host_mem, device_mem))
+                assert size % 7 == 0, f"output size was {size}"
+                outputs.append(HostDeviceMem(host_mem, device_mem, nbytes))
             output_idx += 1
-        assert len(inputs) == 1
-        assert len(outputs) == 1
+        assert len(inputs) == 1, f"inputs len was {len(inputs)}"
+        assert len(outputs) == 1, f"output len was {len(outputs)}"
         return inputs, outputs, bindings, stream
 
     def _do_inference(self):
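The nbytes expression replaces pycuda's pagelocked_empty bookkeeping with explicit arithmetic: element count times the numpy itemsize of the binding's dtype. A worked example, assuming a hypothetical 1x3x320x320 float32 input binding (size here stands in for the element count computed from the binding dims):

import numpy as np

size = 1 * 3 * 320 * 320                   # element count of the binding
itemsize = np.dtype(np.float32).itemsize   # 4 bytes per element
nbytes = size * itemsize
print(nbytes)                              # 1228800 bytes for this shape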
@@ -105,16 +130,16 @@ class TensorRtDetector(DetectionApi):
         """
         # Transfer input data to the GPU.
         [
-            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
+            cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, self.stream)
             for inp in self.inputs
         ]
         # Run inference.
         self.context.execute_async_v2(
-            bindings=self.bindings, stream_handle=self.stream.handle
+            bindings=self.bindings, stream_handle=self.stream.getPtr()
         )
         # Transfer predictions back from the GPU.
         [
-            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
+            cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, self.stream)
             for out in self.outputs
         ]
         # Synchronize the stream
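Both copies and the kernel launch are enqueued on the same stream, so a single synchronize at the end is what makes the host-side output valid. A small sketch of that round trip with cuda-python, assuming cuInit and a current context already exist; the inference step is elided and the 4-element buffers are illustrative:

import numpy as np
from cuda import cuda

host_in = np.arange(4, dtype=np.float32)
host_out = np.empty(4, dtype=np.float32)
nbytes = host_in.nbytes

err, dev = cuda.cuMemAlloc(nbytes)
err, stream = cuda.cuStreamCreate(0)
cuda.cuMemcpyHtoDAsync(dev, host_in, nbytes, stream)   # host -> device
# ... execute_async_v2(...) would run here on the same stream ...
cuda.cuMemcpyDtoHAsync(host_out, dev, nbytes, stream)  # device -> host
cuda.cuStreamSynchronize(stream)                       # wait for both copies
cuda.cuStreamDestroy(stream)
cuda.cuMemFree(dev)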
@@ -150,6 +175,7 @@ class TensorRtDetector(DetectionApi):
         """Free CUDA memories."""
         del self.outputs
         del self.inputs
+        cuda.cuStreamDestroy(self.stream)
         del self.stream
 
     def _postprocess_yolo(self, trt_outputs, img_w, img_h, conf_th, nms_threshold):
@@ -183,10 +209,10 @@ class TensorRtDetector(DetectionApi):
         # 2..5 - a value between 0 and 1 of the box: [top, left, bottom, right]
 
         # transform [height, width, 3] into (3, H, W)
-        tensor_input = tensor_input.transpose((2, 0, 1)).astype(np.float32)
+        # tensor_input = tensor_input.transpose((2, 0, 1)).astype(np.float32)
 
         # normalize
-        tensor_input /= 255.0
+        # tensor_input /= 255.0
 
         self.inputs[0].host = np.ascontiguousarray(tensor_input)
         trt_outputs = self._do_inference()
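For reference, what the now-commented lines did: convert an HWC frame to the CHW float32 layout and scale it to [0, 1]. Disabling them here suggests the caller is expected to supply the tensor already in that form. A sketch with an assumed 320x320 frame:

import numpy as np

frame = np.zeros((320, 320, 3), dtype=np.uint8)               # H, W, C
tensor_input = frame.transpose((2, 0, 1)).astype(np.float32)  # -> C, H, W
tensor_input /= 255.0                                         # [0, 1] range
print(tensor_input.shape)                                     # (3, 320, 320)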
@@ -1,9 +1,10 @@
-cuda-python == 11.8.*
-tensorrt == 8.5.*
-nvidia-cuda-runtime-cu11 == 11.8.*
-nvidia-cublas-cu11 == 11.11.*
-nvidia-cudnn-cu11 == 8.7.*
-pyindex-nvidia
-polygraphy-trtexec
-# tensorflow
-# easydict
+nvidia-pyindex; platform_machine == 'x86_64'
+nvidia-tensorrt == 8.4.1.5; platform_machine == 'x86_64'
+cuda-python == 11.7; platform_machine == 'x86_64'
+cython == 0.29.*; platform_machine == 'x86_64'
+nvidia-cuda-runtime-cu11 == 11.7.*; platform_machine == 'x86_64'
+nvidia-cublas-cu11 == 11.10.*; platform_machine == 'x86_64'
+nvidia-cudnn-cu11 == 8.4.*; platform_machine == 'x86_64'
+polygraphy
+tensorflow
+easydict
@@ -24,9 +24,14 @@ zeroconf == 0.39.4
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-manylinux_2_31_x86_64.whl; platform_machine == 'x86_64'
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-linux_aarch64.whl; platform_machine == 'aarch64'
 openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.2.0/openvino-2022.2.0-000-cp39-cp39-linux_armv7l.whl; platform_machine == 'armv7l'
-# NVidia TensorRT Support
-cuda-python == 11.8.*; platform_machine == 'x86_64'
-tensorrt == 8.5.*; platform_machine == 'x86_64'
-nvidia-cuda-runtime-cu11 == 11.8.*; platform_machine == 'x86_64'
-nvidia-cublas-cu11 == 11.11.*; platform_machine == 'x86_64'
-nvidia-cudnn-cu11 == 8.7.*; platform_machine == 'x86_64'
+# NVidia TensorRT Support (amd64 only)
+nvidia-pyindex; platform_machine == 'x86_64'
+nvidia-tensorrt == 8.4.1.5; platform_machine == 'x86_64'
+cuda-python == 11.7; platform_machine == 'x86_64'
+cython == 0.29.*; platform_machine == 'x86_64'
+nvidia-cuda-runtime-cu11 == 2022.4.25; platform_machine == 'x86_64'
+nvidia-cuda-runtime-cu117 == 11.7.*; platform_machine == 'x86_64'
+nvidia-cublas-cu11 == 2022.4.8; platform_machine == 'x86_64'
+nvidia-cublas-cu117 == 11.10.*; platform_machine == 'x86_64'
+nvidia-cudnn-cu11 == 2022.5.19; platform_machine == 'x86_64'
+nvidia-cudnn-cu116 == 8.4.1*; platform_machine == 'x86_64'
@@ -1 +1,2 @@
 scikit-build == 0.14.1
+nvidia-pyindex
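nvidia-pyindex configures pip to use NVIDIA's package index, so it has to be installed before the nvidia-* pins can resolve; that is presumably why it is added both to the wheels list and to this earlier-installed requirements file. A post-install sanity check that the pins took effect (expected values follow the pins in this commit):

import tensorrt as trt
from cuda import cuda

print(trt.__version__)  # expected to start with 8.4.1 per the nvidia-tensorrt pin

err, driver_version = cuda.cuDriverGetVersion()
print(driver_version)   # e.g. 11070 on a CUDA 11.7 driver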