mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-02-03 09:45:22 +03:00
TensorRT Cuda library rework WIP
Does not run
This commit is contained in:
parent
ccc1218cd5
commit
556d82da7a
@ -28,10 +28,12 @@ logger = logging.getLogger(__name__)
|
|||||||
class HostDeviceMem(object):
|
class HostDeviceMem(object):
|
||||||
"""Simple helper data class that's a little nicer to use than a 2-tuple."""
|
"""Simple helper data class that's a little nicer to use than a 2-tuple."""
|
||||||
|
|
||||||
def __init__(self, host_mem, device_mem, nbytes):
|
def __init__(self, host_mem, device_mem, nbytes, size):
|
||||||
self.host = host_mem
|
self.host = host_mem
|
||||||
|
err, self.host_dev = cuda.cuMemHostGetDevicePointer(self.host, 0)
|
||||||
self.device = device_mem
|
self.device = device_mem
|
||||||
self.nbytes = nbytes
|
self.nbytes = nbytes
|
||||||
|
self.size = size
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
|
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
|
||||||
@ -103,7 +105,9 @@ class TensorRtDetector(DetectionApi):
|
|||||||
* np.dtype(trt.nptype(self.engine.get_binding_dtype(binding))).itemsize
|
* np.dtype(trt.nptype(self.engine.get_binding_dtype(binding))).itemsize
|
||||||
)
|
)
|
||||||
# Allocate host and device buffers
|
# Allocate host and device buffers
|
||||||
err, host_mem = cuda.cuMemAllocHost(nbytes)
|
err, host_mem = cuda.cuMemHostAlloc(
|
||||||
|
nbytes, Flags=cuda.CU_MEMHOSTALLOC_DEVICEMAP
|
||||||
|
)
|
||||||
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
|
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
|
||||||
err, device_mem = cuda.cuMemAlloc(nbytes)
|
err, device_mem = cuda.cuMemAlloc(nbytes)
|
||||||
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
|
assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"
|
||||||
@ -111,12 +115,12 @@ class TensorRtDetector(DetectionApi):
|
|||||||
bindings.append(int(device_mem))
|
bindings.append(int(device_mem))
|
||||||
# Append to the appropriate list.
|
# Append to the appropriate list.
|
||||||
if self.engine.binding_is_input(binding):
|
if self.engine.binding_is_input(binding):
|
||||||
inputs.append(HostDeviceMem(host_mem, device_mem, nbytes))
|
inputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
|
||||||
else:
|
else:
|
||||||
# each grid has 3 anchors, each anchor generates a detection
|
# each grid has 3 anchors, each anchor generates a detection
|
||||||
# output of 7 float32 values
|
# output of 7 float32 values
|
||||||
assert size % 7 == 0, f"output size was {size}"
|
assert size % 7 == 0, f"output size was {size}"
|
||||||
outputs.append(HostDeviceMem(host_mem, device_mem, nbytes))
|
outputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
|
||||||
output_idx += 1
|
output_idx += 1
|
||||||
assert len(inputs) == 1, f"inputs len was {len(inputs)}"
|
assert len(inputs) == 1, f"inputs len was {len(inputs)}"
|
||||||
assert len(outputs) == 1, f"output len was {len(outputs)}"
|
assert len(outputs) == 1, f"output len was {len(outputs)}"
|
||||||
@ -143,9 +147,9 @@ class TensorRtDetector(DetectionApi):
|
|||||||
for out in self.outputs
|
for out in self.outputs
|
||||||
]
|
]
|
||||||
# Synchronize the stream
|
# Synchronize the stream
|
||||||
self.stream.synchronize()
|
cuda.cuStreamSynchronize(self.stream)
|
||||||
# Return only the host outputs.
|
# Return only the host outputs.
|
||||||
return [out.host for out in self.outputs]
|
return [np.array([int(out.host_dev)], dtype=np.float32) for out in self.outputs]
|
||||||
|
|
||||||
def __init__(self, det_device=None, model_config=None, num_threads=1):
|
def __init__(self, det_device=None, model_config=None, num_threads=1):
|
||||||
# def __init__(self, detector_config: DetectorConfig, model_path: str):
|
# def __init__(self, detector_config: DetectorConfig, model_path: str):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user