frigate/frigate/util/model.py
felalex 351bef936f fix: omit gpu_mem_limit on CUDA query failure instead of guessing 4 GB
When cudaMemGetInfo cannot be called or returns an error, compute_cuda_mem_limit
now returns None and the caller skips injecting gpu_mem_limit, leaving ORT to
manage its own arena (grow-as-needed up to device capacity).
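
Illustrative caller pattern (shortened, not the exact diff):

    limit = compute_cuda_mem_limit(model_path)
    if limit is not None:
        provider_options["gpu_mem_limit"] = limit
    # on None: leave gpu_mem_limit unset entirely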

Tradeoff documented in the docstring:
- Old behavior returned a hardcoded 4 GB. That was wrong for low-VRAM devices
  (Jetson Nano 4 GB shared, Quadro K620 2 GB, GT 1030 2 GB) and broken
  /dev/nvidia* container passthroughs, where requesting 4 GB caused
  cudaErrorMemoryAllocation at session init. It was also wrong for big GPUs
  (24 GB RTX 3090 with 20 GB free), needlessly starving the session.
- The leak vectors fixed elsewhere in this PR (mem_pattern + mallopt) are
  independent of the BFC arena cap, so dropping the cap on the failure path
  does not reintroduce the leak.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-03 16:26:09 -07:00


"""Model Utils"""
import ctypes
import logging
import os
from typing import Any
import cv2
import numpy as np
import onnxruntime as ort
from frigate.const import MODEL_CACHE_DIR
logger = logging.getLogger(__name__)
### Post Processing
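# Each post-processing function below returns a (20, 6) float32 array whose
# rows are [class_id, score, y_min, x_min, y_max, x_max], with coordinates
# normalized to the model input size; unused rows remain zero.
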
def post_process_dfine(
    tensor_output: np.ndarray, width: int, height: int
) -> np.ndarray:
    class_ids = tensor_output[0][tensor_output[2] > 0.4]
    boxes = tensor_output[1][tensor_output[2] > 0.4]
    scores = tensor_output[2][tensor_output[2] > 0.4]

    input_shape = np.array([height, width, height, width])
    boxes = np.divide(boxes, input_shape, dtype=np.float32)
    indices = cv2.dnn.NMSBoxes(boxes, scores, score_threshold=0.4, nms_threshold=0.4)

    detections = np.zeros((20, 6), np.float32)

    for i, (bbox, confidence, class_id) in enumerate(
        zip(boxes[indices], scores[indices], class_ids[indices])
    ):
        if i == 20:
            break

        detections[i] = [
            class_id,
            confidence,
            bbox[1],
            bbox[0],
            bbox[3],
            bbox[2],
        ]

    return detections


def post_process_rfdetr(tensor_output: list[np.ndarray]) -> np.ndarray:
    boxes = tensor_output[0]
    raw_scores = tensor_output[1]

    # apply softmax to scores
    exp = np.exp(raw_scores - np.max(raw_scores, axis=-1, keepdims=True))
    all_scores = exp / np.sum(exp, axis=-1, keepdims=True)

    # get highest scoring class from every detection (column 0 is skipped)
    scores = np.max(all_scores[0, :, 1:], axis=-1)
    labels = np.argmax(all_scores[0, :, 1:], axis=-1)

    idxs = scores > 0.4
    filtered_boxes = boxes[0, idxs]
    filtered_scores = scores[idxs]
    filtered_labels = labels[idxs]

    # convert boxes from [x_center, y_center, width, height] to [x_min, y_min, x_max, y_max]
    x_center, y_center, w, h = (
        filtered_boxes[:, 0],
        filtered_boxes[:, 1],
        filtered_boxes[:, 2],
        filtered_boxes[:, 3],
    )
    x_min = x_center - w / 2
    y_min = y_center - h / 2
    x_max = x_center + w / 2
    y_max = y_center + h / 2
    filtered_boxes = np.stack([x_min, y_min, x_max, y_max], axis=-1)

    # apply nms
    indices = cv2.dnn.NMSBoxes(
        filtered_boxes, filtered_scores, score_threshold=0.4, nms_threshold=0.4
    )

    detections = np.zeros((20, 6), np.float32)

    for i, (bbox, confidence, class_id) in enumerate(
        zip(filtered_boxes[indices], filtered_scores[indices], filtered_labels[indices])
    ):
        if i == 20:
            break

        detections[i] = [
            class_id,
            confidence,
            bbox[1],
            bbox[0],
            bbox[3],
            bbox[2],
        ]

    return detections


def __post_process_multipart_yolo(
    output_list,
    width,
    height,
):
    anchors = [
        [(12, 16), (19, 36), (40, 28)],
        [(36, 75), (76, 55), (72, 146)],
        [(142, 110), (192, 243), (459, 401)],
    ]
    stride_map = {0: 8, 1: 16, 2: 32}

    all_boxes = []
    all_scores = []
    all_class_ids = []

    for i, output in enumerate(output_list):
        bs, _, ny, nx = output.shape
        stride = stride_map[i]
        anchor_set = anchors[i]
        num_anchors = len(anchor_set)

        output = output.reshape(bs, num_anchors, 85, ny, nx)
        output = output.transpose(0, 1, 3, 4, 2)
        output = output[0]

        for a_idx, (anchor_w, anchor_h) in enumerate(anchor_set):
            for y in range(ny):
                for x in range(nx):
                    pred = output[a_idx, y, x]
                    class_probs = pred[5:]
                    class_id = np.argmax(class_probs)
                    class_conf = class_probs[class_id]
                    conf = class_conf * pred[4]

                    if conf < 0.4:
                        continue

                    dx = pred[0]
                    dy = pred[1]
                    dw = pred[2]
                    dh = pred[3]

                    # decode grid-relative offsets: box center from the cell
                    # offset times stride, box size from the squared offset
                    # times the anchor dimensions
                    bx = ((dx * 2.0 - 0.5) + x) * stride
                    by = ((dy * 2.0 - 0.5) + y) * stride
                    bw = ((dw * 2.0) ** 2) * anchor_w
                    bh = ((dh * 2.0) ** 2) * anchor_h

                    x1 = max(0, bx - bw / 2)
                    y1 = max(0, by - bh / 2)
                    x2 = min(width, bx + bw / 2)
                    y2 = min(height, by + bh / 2)

                    all_boxes.append([x1, y1, x2, y2])
                    all_scores.append(conf)
                    all_class_ids.append(class_id)

    indices = cv2.dnn.NMSBoxes(
        bboxes=all_boxes,
        scores=all_scores,
        score_threshold=0.4,
        nms_threshold=0.4,
    )

    results = np.zeros((20, 6), np.float32)

    if len(indices) > 0:
        for i, idx in enumerate(indices.flatten()[:20]):
            class_id = all_class_ids[idx]
            conf = all_scores[idx]
            x1, y1, x2, y2 = all_boxes[idx]
            results[i] = [
                class_id,
                conf,
                y1 / height,
                x1 / width,
                y2 / height,
                x2 / width,
            ]

    return results


def __post_process_nms_yolo(predictions: np.ndarray, width, height) -> np.ndarray:
    predictions = np.squeeze(predictions)

    # transpose the output so it has order (inferences, class_ids)
    if predictions.shape[0] < predictions.shape[1]:
        predictions = predictions.T

    scores = np.max(predictions[:, 4:], axis=1)
    predictions = predictions[scores > 0.4, :]
    scores = scores[scores > 0.4]
    class_ids = np.argmax(predictions[:, 4:], axis=1)

    # convert boxes from [x_center, y_center, width, height] to [x1, y1, x2, y2]
    boxes = predictions[:, :4]
    boxes_xyxy = np.ones_like(boxes)
    boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2
    boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2
    boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2
    boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2
    boxes = boxes_xyxy

    # run NMS
    indices = cv2.dnn.NMSBoxes(boxes, scores, score_threshold=0.4, nms_threshold=0.4)
    detections = np.zeros((20, 6), np.float32)

    for i, (bbox, confidence, class_id) in enumerate(
        zip(boxes[indices], scores[indices], class_ids[indices])
    ):
        if i == 20:
            break

        detections[i] = [
            class_id,
            confidence,
            bbox[1] / height,
            bbox[0] / width,
            bbox[3] / height,
            bbox[2] / width,
        ]

    return detections


def post_process_yolo(output: list[np.ndarray], width: int, height: int) -> np.ndarray:
    if len(output) > 1:
        return __post_process_multipart_yolo(output, width, height)
    else:
        return __post_process_nms_yolo(output[0], width, height)


def post_process_yolox(
    predictions: np.ndarray,
    width: int,
    height: int,
    grids: np.ndarray,
    expanded_strides: np.ndarray,
) -> np.ndarray:
    predictions[..., :2] = (predictions[..., :2] + grids) * expanded_strides
    predictions[..., 2:4] = np.exp(predictions[..., 2:4]) * expanded_strides

    # process organized predictions
    predictions = predictions[0]
    boxes = predictions[:, :4]
    scores = predictions[:, 4:5] * predictions[:, 5:]

    boxes_xyxy = np.ones_like(boxes)
    boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2
    boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2
    boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2
    boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2

    cls_inds = scores.argmax(1)
    scores = scores[np.arange(len(cls_inds)), cls_inds]

    indices = cv2.dnn.NMSBoxes(
        boxes_xyxy, scores, score_threshold=0.4, nms_threshold=0.4
    )
    detections = np.zeros((20, 6), np.float32)

    for i, (bbox, confidence, class_id) in enumerate(
        zip(boxes_xyxy[indices], scores[indices], cls_inds[indices])
    ):
        if i == 20:
            break

        detections[i] = [
            class_id,
            confidence,
            bbox[1] / height,
            bbox[0] / width,
            bbox[3] / height,
            bbox[2] / width,
        ]

    return detections


### ONNX Utilities
def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int | None:
    """Compute a per-session GPU memory limit for the ORT CUDA EP BFC arena.

    For CudaGraphRunner (YOLO detection) do NOT call this - CUDA graph capture
    requires all intermediate tensors to be live simultaneously, so peak GPU memory
    is 15-20x the model file size and cannot be safely capped. This function is
    intended for embedding ONNXModelRunner sessions only.

    Returns a limit derived from:
    - min(model file size x peak_multiplier, 80% of total VRAM, 90% of free VRAM)

    Returns None if the CUDA runtime query fails. The caller MUST then omit
    gpu_mem_limit from provider_options so ORT falls back to its own default
    (grow-as-needed up to device capacity).

    Tradeoff: a hardcoded fallback (e.g. 4 GB) was previously returned here,
    but that number is wrong for both ends of the spectrum:
    - On Jetson Nano (4 GB shared), Quadro K620 (2 GB), GT 1030 (2 GB), and
      any container where /dev/nvidia* passthrough is broken, asking for 4 GB
      causes ORT session init to fail with cudaErrorMemoryAllocation.
    - On a 24 GB RTX 3090 with 20 GB free, capping at 4 GB needlessly
      starves the session and forces extra arena reallocations.

    Returning None and letting ORT manage the arena itself is the
    least-surprising behavior when we cannot actually measure VRAM. The
    leak vectors this PR addresses (mem_pattern, mallopt) are independent
    of the BFC arena cap, so dropping the cap on the failure path does
    not reintroduce the leak.
    """
    try:
        libcudart = ctypes.CDLL("libcudart.so")
        free_bytes = ctypes.c_size_t()
        total_bytes = ctypes.c_size_t()
        rc = libcudart.cudaMemGetInfo(
            ctypes.byref(free_bytes), ctypes.byref(total_bytes)
        )

        if rc != 0 or total_bytes.value == 0:
            raise RuntimeError(f"cudaMemGetInfo rc={rc} total={total_bytes.value}")

        total = total_bytes.value
        free = free_bytes.value
    except Exception as e:
        logger.debug("cudaMemGetInfo unavailable (%s); omitting gpu_mem_limit", e)
        return None

    peak_multiplier = 14 if cuda_graph else 7
    desired = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3)

    # Honor free VRAM so co-resident embedding sessions (jina text + vision,
    # paddleocr det + rec, arcface) don't OOM each other on shared GPUs.
    return min(desired, int(total * 0.80), int(free * 0.90))


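# Illustrative arithmetic for the cap above (numbers are hypothetical): a
# 100 MiB embedding model with cuda_graph=False gives
# desired = max(100 MiB * 7, 2 GiB) = 2 GiB; on an 8 GiB card with 6 GiB
# free, the result is min(2 GiB, 0.80 * 8 GiB, 0.90 * 6 GiB) = 2 GiB.

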
def get_ort_providers(
    force_cpu: bool = False,
    device: str | None = "AUTO",
    requires_fp16: bool = False,
) -> tuple[list[str], list[dict[str, Any]]]:
    if force_cpu:
        return (
            ["CPUExecutionProvider"],
            [
                {
                    "enable_cpu_mem_arena": False,
                }
            ],
        )

    providers = []
    options = []

    for provider in ort.get_available_providers():
        if provider == "CUDAExecutionProvider":
            device_id = 0 if (not device or not device.isdigit()) else int(device)
            providers.append(provider)
            options.append(
                {
                    "arena_extend_strategy": "kSameAsRequested",
                    "use_ep_level_unified_stream": True,
                    "device_id": device_id,
                }
            )
        elif provider == "TensorrtExecutionProvider":
            # TensorrtExecutionProvider uses too much memory without options to control it,
            # so it is not enabled by default
            if device == "Tensorrt":
                os.makedirs(
                    os.path.join(MODEL_CACHE_DIR, "tensorrt/ort/trt-engines"),
                    exist_ok=True,
                )
                device_id = 0 if not device.isdigit() else int(device)
                providers.append(provider)
                options.append(
                    {
                        "device_id": device_id,
                        "trt_fp16_enable": requires_fp16
                        and os.environ.get("USE_FP_16", "True") != "False",
                        "trt_timing_cache_enable": True,
                        "trt_engine_cache_enable": True,
                        "trt_timing_cache_path": os.path.join(
                            MODEL_CACHE_DIR, "tensorrt/ort"
                        ),
                        "trt_engine_cache_path": os.path.join(
                            MODEL_CACHE_DIR, "tensorrt/ort/trt-engines"
                        ),
                    }
                )
            else:
                continue
        elif provider == "OpenVINOExecutionProvider":
            # OpenVINO is used directly
            if device == "OpenVINO":
                os.makedirs(
                    os.path.join(MODEL_CACHE_DIR, "openvino/ort"), exist_ok=True
                )
                providers.append(provider)
                options.append(
                    {
                        "cache_dir": os.path.join(MODEL_CACHE_DIR, "openvino/ort"),
                        "device_type": device,
                    }
                )
        elif provider == "MIGraphXExecutionProvider":
            migraphx_cache_dir = os.path.join(MODEL_CACHE_DIR, "migraphx")
            os.makedirs(migraphx_cache_dir, exist_ok=True)
            providers.append(provider)
            options.append(
                {
                    "migraphx_model_cache_dir": migraphx_cache_dir,
                }
            )
        elif provider == "CPUExecutionProvider":
            providers.append(provider)
            options.append(
                {
                    "enable_cpu_mem_arena": False,
                }
            )
        elif provider == "AzureExecutionProvider":
            # Skip Azure provider - not typically available on local hardware
            # and prevents fallback to OpenVINO when it's the first provider
            continue
        else:
            providers.append(provider)
            options.append({})

    return (providers, options)
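

# Illustrative consumer (not part of this module): the two returned lists
# pair positionally, matching onnxruntime's InferenceSession signature.
#
#   providers, options = get_ort_providers(device="0")
#   session = ort.InferenceSession(
#       model_path, providers=providers, provider_options=options
#   )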