TensorRT Cuda library rework WIP

Does not run
Nate Meyer 2022-12-15 14:55:22 -05:00
parent ccc1218cd5
commit 556d82da7a


@@ -28,10 +28,12 @@ logger = logging.getLogger(__name__)
 class HostDeviceMem(object):
     """Simple helper data class that's a little nicer to use than a 2-tuple."""
 
-    def __init__(self, host_mem, device_mem, nbytes):
+    def __init__(self, host_mem, device_mem, nbytes, size):
         self.host = host_mem
+        err, self.host_dev = cuda.cuMemHostGetDevicePointer(self.host, 0)
         self.device = device_mem
         self.nbytes = nbytes
+        self.size = size
 
     def __str__(self):
         return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
@@ -103,7 +105,9 @@ class TensorRtDetector(DetectionApi):
                 * np.dtype(trt.nptype(self.engine.get_binding_dtype(binding))).itemsize
             )
             # Allocate host and device buffers
-            err, host_mem = cuda.cuMemAllocHost(nbytes)
+            err, host_mem = cuda.cuMemHostAlloc(
+                nbytes, Flags=cuda.CU_MEMHOSTALLOC_DEVICEMAP
+            )
             assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
             err, device_mem = cuda.cuMemAlloc(nbytes)
             assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
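cuMemAllocHost only pins the allocation; cuMemHostAlloc with CU_MEMHOSTALLOC_DEVICEMAP additionally maps it into the device address space, which is what makes the cuMemHostGetDevicePointer call in the previous hunk valid. (The assert message still names cuMemAllocHost, a leftover from the old call.) Every cuda-python driver call returns its CUresult, so a small helper avoids repeating the asserts; this helper is illustrative, not part of the commit:

from cuda import cuda

def cu_check(err, what="CUDA driver call"):
    # Raise rather than assert so failures survive python -O.
    if err is not cuda.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"{what} failed with {err}")

# e.g.
# err, host_mem = cuda.cuMemHostAlloc(nbytes, Flags=cuda.CU_MEMHOSTALLOC_DEVICEMAP)
# cu_check(err, "cuMemHostAlloc")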
@@ -111,12 +115,12 @@ class TensorRtDetector(DetectionApi):
             bindings.append(int(device_mem))
             # Append to the appropriate list.
             if self.engine.binding_is_input(binding):
-                inputs.append(HostDeviceMem(host_mem, device_mem, nbytes))
+                inputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
             else:
                 # each grid has 3 anchors, each anchor generates a detection
                 # output of 7 float32 values
                 assert size % 7 == 0, f"output size was {size}"
-                outputs.append(HostDeviceMem(host_mem, device_mem, nbytes))
+                outputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
                 output_idx += 1
         assert len(inputs) == 1, f"inputs len was {len(inputs)}"
         assert len(outputs) == 1, f"output len was {len(outputs)}"
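Carrying size (the element count) alongside nbytes lets the detector reinterpret the flat output buffer later. Since each detection is 7 float32 values, the buffer decodes as one row per detection; the field names below follow the usual TensorRT yolo-plugin layout and are an assumption, not code from this commit:

import numpy as np

size = 1344 * 7  # example element count; the assert guarantees size % 7 == 0
raw = np.zeros(size, dtype=np.float32)  # stand-in for a copied-back host buffer

# One row per detection: x, y, w, h, box_confidence, class_id, class_prob
detections = raw.reshape(-1, 7)
assert detections.shape == (1344, 7)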
@@ -143,9 +147,9 @@ class TensorRtDetector(DetectionApi):
             for out in self.outputs
         ]
         # Synchronize the stream
-        self.stream.synchronize()
+        cuda.cuStreamSynchronize(self.stream)
         # Return only the host outputs.
-        return [out.host for out in self.outputs]
+        return [np.array([int(out.host_dev)], dtype=np.float32) for out in self.outputs]
 
     def __init__(self, det_device=None, model_config=None, num_threads=1):
         # def __init__(self, detector_config: DetectorConfig, model_path: str):
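As committed, the return value wraps the device pointer itself in a one-element float32 array, which is consistent with the "Does not run" note above. A working readback under the mapped-pinned layout would copy each device buffer into its pinned host mirror and then view that memory through ctypes; a sketch, not the commit's code:

import ctypes
import numpy as np
from cuda import cuda

def read_outputs(outputs, stream):
    # Queue async device-to-host copies into the pinned buffers.
    for out in outputs:
        cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, stream)
    # Block until the copies land before touching host memory.
    cuda.cuStreamSynchronize(stream)
    # Zero-copy float32 views over the pinned allocations.
    return [
        np.ctypeslib.as_array((ctypes.c_float * out.size).from_address(int(out.host)))
        for out in outputs
    ]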