Add support for TensorRT v10 (multiple API calls were renamed or removed)

This commit is contained in:
Rémi Bédard-Couture 2024-04-30 00:17:38 -04:00
parent 1c9626ecff
commit 9a642086f9

View File

@ -6,6 +6,7 @@ import numpy as np
try: try:
import tensorrt as trt import tensorrt as trt
from cuda import cuda from cuda import cuda
TRT_VERSION=int(trt.__version__[0:trt.__version__.find(".")])
TRT_SUPPORT = True TRT_SUPPORT = True
except ModuleNotFoundError: except ModuleNotFoundError:
@ -91,22 +92,40 @@ class TensorRtDetector(DetectionApi):
def _get_input_shape(self): def _get_input_shape(self):
"""Get input shape of the TensorRT YOLO engine.""" """Get input shape of the TensorRT YOLO engine."""
binding = self.engine[0] binding = self.engine[0]
assert self.engine.binding_is_input(binding) if TRT_VERSION < 10:
binding_dims = self.engine.get_binding_shape(binding) assert self.engine.binding_is_input(binding)
if len(binding_dims) == 4: binding_dims = self.engine.get_binding_shape(binding)
return ( if len(binding_dims) == 4:
tuple(binding_dims[2:]), return (
trt.nptype(self.engine.get_binding_dtype(binding)), tuple(binding_dims[2:]),
) trt.nptype(self.engine.get_binding_dtype(binding)),
elif len(binding_dims) == 3: )
return ( elif len(binding_dims) == 3:
tuple(binding_dims[1:]), return (
trt.nptype(self.engine.get_binding_dtype(binding)), tuple(binding_dims[1:]),
) trt.nptype(self.engine.get_binding_dtype(binding)),
)
else:
raise ValueError(
"bad dims of binding %s: %s" % (binding, str(binding_dims))
)
else: else:
raise ValueError( assert binding == "input"
"bad dims of binding %s: %s" % (binding, str(binding_dims)) binding_dims = self.engine.get_tensor_shape("input")
) if len(binding_dims) == 4:
return (
tuple(binding_dims[2:]),
trt.nptype(self.engine.get_tensor_dtype(binding)),
)
elif len(binding_dims) == 3:
return (
tuple(binding_dims[1:]),
trt.nptype(self.engine.get_tensor_dtype(binding)),
)
else:
raise ValueError(
"bad dims of binding %s: %s" % (binding, str(binding_dims))
)
def _allocate_buffers(self): def _allocate_buffers(self):
"""Allocates all host/device in/out buffers required for an engine.""" """Allocates all host/device in/out buffers required for an engine."""
@ -115,41 +134,78 @@ class TensorRtDetector(DetectionApi):
bindings = [] bindings = []
output_idx = 0 output_idx = 0
for binding in self.engine: for binding in self.engine:
binding_dims = self.engine.get_binding_shape(binding) if TRT_VERSION < 10:
if len(binding_dims) == 4: binding_dims = self.engine.get_binding_shape(binding)
# explicit batch case (TensorRT 7+) if len(binding_dims) == 4:
size = trt.volume(binding_dims) # explicit batch case (TensorRT 7+)
elif len(binding_dims) == 3: size = trt.volume(binding_dims)
# implicit batch case (TensorRT 6 or older) elif len(binding_dims) == 3:
size = trt.volume(binding_dims) * self.engine.max_batch_size # implicit batch case (TensorRT 6 or older)
else: size = trt.volume(binding_dims) * self.engine.max_batch_size
raise ValueError( else:
"bad dims of binding %s: %s" % (binding, str(binding_dims)) raise ValueError(
"bad dims of binding %s: %s" % (binding, str(binding_dims))
)
nbytes = size * self.engine.get_binding_dtype(binding).itemsize
# Allocate host and device buffers
err, host_mem = cuda.cuMemHostAlloc(
nbytes, Flags=cuda.CU_MEMHOSTALLOC_DEVICEMAP
) )
nbytes = size * self.engine.get_binding_dtype(binding).itemsize assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
# Allocate host and device buffers logger.debug(
err, host_mem = cuda.cuMemHostAlloc( f"Allocated Tensor Binding {binding} Memory {nbytes} Bytes ({size} * {self.engine.get_binding_dtype(binding)})"
nbytes, Flags=cuda.CU_MEMHOSTALLOC_DEVICEMAP )
) err, device_mem = cuda.cuMemAlloc(nbytes)
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}" assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
logger.debug( # Append the device buffer to device bindings.
f"Allocated Tensor Binding {binding} Memory {nbytes} Bytes ({size} * {self.engine.get_binding_dtype(binding)})" bindings.append(int(device_mem))
) # Append to the appropriate list.
err, device_mem = cuda.cuMemAlloc(nbytes) if self.engine.binding_is_input(binding):
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}" logger.debug(f"Input has Shape {binding_dims}")
# Append the device buffer to device bindings. inputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
bindings.append(int(device_mem)) else:
# Append to the appropriate list. # each grid has 3 anchors, each anchor generates a detection
if self.engine.binding_is_input(binding): # output of 7 float32 values
logger.debug(f"Input has Shape {binding_dims}") assert size % 7 == 0, f"output size was {size}"
inputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size)) logger.debug(f"Output has Shape {binding_dims}")
outputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
output_idx += 1
else: else:
# each grid has 3 anchors, each anchor generates a detection binding_dims = self.engine.get_tensor_shape(binding)
# output of 7 float32 values if len(binding_dims) == 4:
assert size % 7 == 0, f"output size was {size}" # explicit batch case (TensorRT 7+)
logger.debug(f"Output has Shape {binding_dims}") size = trt.volume(binding_dims)
outputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size)) elif len(binding_dims) == 3:
output_idx += 1 # implicit batch case (TensorRT 6 or older)
size = trt.volume(binding_dims) * self.engine.max_batch_size
else:
raise ValueError(
"bad dims of binding %s: %s" % (binding, str(binding_dims))
)
nbytes = size * self.engine.get_tensor_dtype(binding).itemsize
# Allocate host and device buffers
err, host_mem = cuda.cuMemHostAlloc(
nbytes, Flags=cuda.CU_MEMHOSTALLOC_DEVICEMAP
)
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
logger.debug(
f"Allocated Tensor Binding {binding} Memory {nbytes} Bytes ({size} * {self.engine.get_tensor_dtype(binding)})"
)
err, device_mem = cuda.cuMemAlloc(nbytes)
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if binding == "input":
logger.debug(f"Input has Shape {binding_dims}")
inputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
else:
# each grid has 3 anchors, each anchor generates a detection
# output of 7 float32 values
assert size % 7 == 0, f"output size was {size}"
logger.debug(f"Output has Shape {binding_dims}")
outputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
output_idx += 1
assert len(inputs) == 1, f"inputs len was {len(inputs)}" assert len(inputs) == 1, f"inputs len was {len(inputs)}"
assert len(outputs) == 1, f"output len was {len(outputs)}" assert len(outputs) == 1, f"output len was {len(outputs)}"
return inputs, outputs, bindings return inputs, outputs, bindings
@ -170,10 +226,16 @@ class TensorRtDetector(DetectionApi):
] ]
# Run inference. # Run inference.
if not self.context.execute_async_v2( if TRT_VERSION < 10:
bindings=self.bindings, stream_handle=self.stream if not self.context.execute_async_v2(
): bindings=self.bindings, stream_handle=self.stream
logger.warn("Execute returned false") ):
logger.warn("Execute returned false")
else:
if not self.context.execute_v2(
self.bindings
):
logger.warn("Execute returned false")
# Transfer predictions back from the GPU. # Transfer predictions back from the GPU.
[ [