mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-02-03 09:45:22 +03:00
Use the non-async (synchronous) interface and convert input data to float32. Detection now runs without error.
This commit is contained in:
parent
643c3f21cd
commit
4198e79807
@ -74,6 +74,8 @@ class TensorRtDetector(DetectionApi):
|
|||||||
ctypes.cdll.LoadLibrary(
|
ctypes.cdll.LoadLibrary(
|
||||||
"/usr/local/lib/python3.9/dist-packages/tensorrt/libnvinfer.so.8"
|
"/usr/local/lib/python3.9/dist-packages/tensorrt/libnvinfer.so.8"
|
||||||
)
|
)
|
||||||
|
trt.init_libnvinfer_plugins(self.trt_logger, "")
|
||||||
|
|
||||||
ctypes.cdll.LoadLibrary(
|
ctypes.cdll.LoadLibrary(
|
||||||
"/media/frigate/models/tensorrt_demos/yolo/libyolo_layer.so"
|
"/media/frigate/models/tensorrt_demos/yolo/libyolo_layer.so"
|
||||||
)
|
)
|
||||||
@ -82,8 +84,10 @@ class TensorRtDetector(DetectionApi):
|
|||||||
"ERROR: failed to load libraries. %s",
|
"ERROR: failed to load libraries. %s",
|
||||||
e,
|
e,
|
||||||
)
|
)
|
||||||
with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
|
|
||||||
return runtime.deserialize_cuda_engine(f.read())
|
self.runtime = trt.Runtime(self.trt_logger)
|
||||||
|
with open(model_path, "rb") as f:
|
||||||
|
return self.runtime.deserialize_cuda_engine(f.read())
|
||||||
|
|
||||||
def _get_input_shape(self):
|
def _get_input_shape(self):
|
||||||
"""Get input shape of the TensorRT YOLO engine."""
|
"""Get input shape of the TensorRT YOLO engine."""
|
||||||
@ -118,28 +122,28 @@ class TensorRtDetector(DetectionApi):
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
"bad dims of binding %s: %s" % (binding, str(binding_dims))
|
"bad dims of binding %s: %s" % (binding, str(binding_dims))
|
||||||
)
|
)
|
||||||
nbytes = (
|
nbytes = size * self.engine.get_binding_dtype(binding).itemsize
|
||||||
size
|
|
||||||
* np.dtype(trt.nptype(self.engine.get_binding_dtype(binding))).itemsize
|
|
||||||
)
|
|
||||||
# Allocate host and device buffers
|
# Allocate host and device buffers
|
||||||
err, host_mem = cuda.cuMemHostAlloc(
|
err, host_mem = cuda.cuMemHostAlloc(
|
||||||
nbytes, Flags=cuda.CU_MEMHOSTALLOC_DEVICEMAP
|
nbytes, Flags=cuda.CU_MEMHOSTALLOC_DEVICEMAP
|
||||||
)
|
)
|
||||||
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
|
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
|
||||||
|
logger.debug(
|
||||||
|
f"Allocated Tensor Binding {binding} Memory {nbytes} Bytes ({size} * {self.engine.get_binding_dtype(binding)})"
|
||||||
|
)
|
||||||
err, device_mem = cuda.cuMemAlloc(nbytes)
|
err, device_mem = cuda.cuMemAlloc(nbytes)
|
||||||
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
|
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
|
||||||
# Append the device buffer to device bindings.
|
# Append the device buffer to device bindings.
|
||||||
bindings.append(int(device_mem))
|
bindings.append(int(device_mem))
|
||||||
# Append to the appropriate list.
|
# Append to the appropriate list.
|
||||||
if self.engine.binding_is_input(binding):
|
if self.engine.binding_is_input(binding):
|
||||||
logger.info(f"Input has Shape {binding_dims}")
|
logger.debug(f"Input has Shape {binding_dims}")
|
||||||
inputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
|
inputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
|
||||||
else:
|
else:
|
||||||
# each grid has 3 anchors, each anchor generates a detection
|
# each grid has 3 anchors, each anchor generates a detection
|
||||||
# output of 7 float32 values
|
# output of 7 float32 values
|
||||||
assert size % 7 == 0, f"output size was {size}"
|
assert size % 7 == 0, f"output size was {size}"
|
||||||
logger.info(f"Output has Shape {binding_dims}")
|
logger.debug(f"Output has Shape {binding_dims}")
|
||||||
outputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
|
outputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
|
||||||
output_idx += 1
|
output_idx += 1
|
||||||
assert len(inputs) == 1, f"inputs len was {len(inputs)}"
|
assert len(inputs) == 1, f"inputs len was {len(inputs)}"
|
||||||
@ -153,21 +157,14 @@ class TensorRtDetector(DetectionApi):
|
|||||||
Inputs and outputs are expected to be lists of HostDeviceMem objects.
|
Inputs and outputs are expected to be lists of HostDeviceMem objects.
|
||||||
"""
|
"""
|
||||||
# Transfer input data to the GPU.
|
# Transfer input data to the GPU.
|
||||||
[
|
[cuda.cuMemcpyHtoD(inp.device, inp.host, inp.nbytes) for inp in self.inputs]
|
||||||
cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, self.stream)
|
|
||||||
for inp in self.inputs
|
|
||||||
]
|
|
||||||
# Run inference.
|
# Run inference.
|
||||||
self.context.execute_async_v2(
|
if not self.context.execute_v2(bindings=self.bindings):
|
||||||
bindings=self.bindings, stream_handle=self.stream.getPtr()
|
logger.warn(f"Execute returned false")
|
||||||
)
|
|
||||||
# Transfer predictions back from the GPU.
|
# Transfer predictions back from the GPU.
|
||||||
[
|
[cuda.cuMemcpyDtoH(out.host, out.device, out.nbytes) for out in self.outputs]
|
||||||
cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, self.stream)
|
|
||||||
for out in self.outputs
|
|
||||||
]
|
|
||||||
# Synchronize the stream
|
# Synchronize the stream
|
||||||
cuda.cuStreamSynchronize(self.stream)
|
# cuda.cuStreamSynchronize(self.stream)
|
||||||
# Return only the host outputs.
|
# Return only the host outputs.
|
||||||
return [
|
return [
|
||||||
np.array(
|
np.array(
|
||||||
@ -177,8 +174,11 @@ class TensorRtDetector(DetectionApi):
|
|||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, detector_config: TensorRTDetectorConfig):
|
def __init__(self, detector_config: TensorRTDetectorConfig):
|
||||||
# def __init__(self, detector_config: DetectorConfig, model_path: str):
|
(cuda_err,) = cuda.cuInit(0)
|
||||||
# self.fps = EventsPerSecond()
|
assert (
|
||||||
|
cuda_err == cuda.CUresult.CUDA_SUCCESS
|
||||||
|
), f"Failed to initialize cuda {cuda_err}"
|
||||||
|
err, self.cu_ctx = cuda.cuCtxCreate(cuda.CUctx_flags.CU_CTX_MAP_HOST, 0)
|
||||||
self.conf_th = 0.4 ##TODO: model config parameter
|
self.conf_th = 0.4 ##TODO: model config parameter
|
||||||
self.nms_threshold = 0.4
|
self.nms_threshold = 0.4
|
||||||
self.trt_logger = TrtLogger()
|
self.trt_logger = TrtLogger()
|
||||||
@ -206,8 +206,13 @@ class TensorRtDetector(DetectionApi):
|
|||||||
del self.inputs
|
del self.inputs
|
||||||
cuda.cuStreamDestroy(self.stream)
|
cuda.cuStreamDestroy(self.stream)
|
||||||
del self.stream
|
del self.stream
|
||||||
|
del self.engine
|
||||||
|
del self.runtime
|
||||||
|
del self.context
|
||||||
|
del self.trt_logger
|
||||||
|
cuda.cuCtxDestroy(self.cu_ctx)
|
||||||
|
|
||||||
def _postprocess_yolo(self, trt_outputs, img_w, img_h, conf_th, nms_threshold):
|
def _postprocess_yolo(self, trt_outputs, conf_th):
|
||||||
"""Postprocess TensorRT outputs.
|
"""Postprocess TensorRT outputs.
|
||||||
# Args
|
# Args
|
||||||
trt_outputs: a list of 2 or 3 tensors, where each tensor
|
trt_outputs: a list of 2 or 3 tensors, where each tensor
|
||||||
@ -240,16 +245,10 @@ class TensorRtDetector(DetectionApi):
|
|||||||
# normalize
|
# normalize
|
||||||
# tensor_input /= 255.0
|
# tensor_input /= 255.0
|
||||||
|
|
||||||
self.inputs[0].host = np.ascontiguousarray(tensor_input)
|
self.inputs[0].host = np.ascontiguousarray(tensor_input.astype(np.float32))
|
||||||
trt_outputs = self._do_inference()
|
trt_outputs = self._do_inference()
|
||||||
|
|
||||||
raw_detections = self._postprocess_yolo(
|
raw_detections = self._postprocess_yolo(trt_outputs, self.conf_th)
|
||||||
trt_outputs,
|
|
||||||
tensor_input.shape[1],
|
|
||||||
tensor_input.shape[0],
|
|
||||||
self.conf_th,
|
|
||||||
nms_threshold=self.nms_threshold,
|
|
||||||
)
|
|
||||||
|
|
||||||
if len(raw_detections) == 0:
|
if len(raw_detections) == 0:
|
||||||
return np.zeros((20, 6), np.float32)
|
return np.zeros((20, 6), np.float32)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user