Refactor AMD GPU support (#20239)
Some checks are pending
CI / ARM Extra Build (push) Blocked by required conditions
CI / AMD64 Build (push) Waiting to run
CI / ARM Build (push) Waiting to run
CI / Jetson Jetpack 6 (push) Waiting to run
CI / AMD64 Extra Build (push) Blocked by required conditions
CI / Synaptics Build (push) Blocked by required conditions
CI / Assemble and push default build (push) Blocked by required conditions

* Update ROCm to 7.0.1

* Update ONNXRuntime

* Add back in

* Get basic detection working

* Use env vars

* Handle complex migraphx models

* Enable model caching

* Remove unused

* Add tip to docs
This commit is contained in:
Nicolas Mowen 2025-09-27 13:43:11 -06:00 committed by GitHub
parent e6cbc93703
commit c207009d8a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 72 additions and 33 deletions

View File

@ -15,14 +15,14 @@ ARG AMDGPU
RUN apt update -qq && \ RUN apt update -qq && \
apt install -y wget gpg && \ apt install -y wget gpg && \
wget -O rocm.deb https://repo.radeon.com/amdgpu-install/6.4.1/ubuntu/jammy/amdgpu-install_6.4.60401-1_all.deb && \ wget -O rocm.deb https://repo.radeon.com/amdgpu-install/7.0.1/ubuntu/jammy/amdgpu-install_7.0.1.70001-1_all.deb && \
apt install -y ./rocm.deb && \ apt install -y ./rocm.deb && \
apt update && \ apt update && \
apt install -qq -y rocm apt install -qq -y rocm
RUN mkdir -p /opt/rocm-dist/opt/rocm-$ROCM/lib RUN mkdir -p /opt/rocm-dist/opt/rocm-$ROCM/lib
RUN cd /opt/rocm-$ROCM/lib && \ RUN cd /opt/rocm-$ROCM/lib && \
cp -dpr libMIOpen*.so* libamd*.so* libhip*.so* libhsa*.so* libmigraphx*.so* librocm*.so* librocblas*.so* libroctracer*.so* librocsolver*.so* librocfft*.so* librocprofiler*.so* libroctx*.so* /opt/rocm-dist/opt/rocm-$ROCM/lib/ && \ cp -dpr libMIOpen*.so* libamd*.so* libhip*.so* libhsa*.so* libmigraphx*.so* librocm*.so* librocblas*.so* libroctracer*.so* librocsolver*.so* librocfft*.so* librocprofiler*.so* libroctx*.so* librocroller.so* /opt/rocm-dist/opt/rocm-$ROCM/lib/ && \
mkdir -p /opt/rocm-dist/opt/rocm-$ROCM/lib/migraphx/lib && \ mkdir -p /opt/rocm-dist/opt/rocm-$ROCM/lib/migraphx/lib && \
cp -dpr migraphx/lib/* /opt/rocm-dist/opt/rocm-$ROCM/lib/migraphx/lib cp -dpr migraphx/lib/* /opt/rocm-dist/opt/rocm-$ROCM/lib/migraphx/lib
RUN cd /opt/rocm-dist/opt/ && ln -s rocm-$ROCM rocm RUN cd /opt/rocm-dist/opt/ && ln -s rocm-$ROCM rocm
@ -64,11 +64,10 @@ COPY --from=rocm /opt/rocm-dist/ /
####################################################################### #######################################################################
FROM deps-prelim AS rocm-prelim-hsa-override0 FROM deps-prelim AS rocm-prelim-hsa-override0
ENV HSA_ENABLE_SDMA=0 ENV MIGRAPHX_DISABLE_MIOPEN_FUSION=1
ENV TF_ROCM_USE_IMMEDIATE_MODE=1 ENV MIGRAPHX_DISABLE_SCHEDULE_PASS=1
ENV MIGRAPHX_DISABLE_REDUCE_FUSION=1
# avoid kernel crashes ENV MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS=1
ENV HIP_FORCE_DEV_KERNARG=1
COPY --from=rocm-dist / / COPY --from=rocm-dist / /

View File

@ -1 +1 @@
onnxruntime-rocm @ https://github.com/NickM-27/frigate-onnxruntime-rocm/releases/download/v6.4.1/onnxruntime_rocm-1.21.1-cp311-cp311-linux_x86_64.whl onnxruntime-migraphx @ https://github.com/NickM-27/frigate-onnxruntime-rocm/releases/download/v7.0.1/onnxruntime_migraphx-1.23.0-cp311-cp311-linux_x86_64.whl

View File

@ -2,7 +2,7 @@ variable "AMDGPU" {
default = "gfx900" default = "gfx900"
} }
variable "ROCM" { variable "ROCM" {
default = "6.4.1" default = "7.0.1"
} }
variable "HSA_OVERRIDE_GFX_VERSION" { variable "HSA_OVERRIDE_GFX_VERSION" {
default = "" default = ""

View File

@ -555,6 +555,17 @@ $ docker exec -it frigate /bin/bash -c '(unset HSA_OVERRIDE_GFX_VERSION && /opt/
### ROCm Supported Models ### ROCm Supported Models
:::tip
The AMD GPU kernel is known to be problematic, especially when converting models to mxr format. The recommended approach is:
1. Disable object detection in the config.
2. Start up Frigate with the onnx detector configured; the main object detection model will be converted to mxr format and cached in the config directory.
3. Once this is finished, as indicated by the logs, enable object detection in the UI and confirm that it is working correctly.
4. Re-enable object detection in the config.
:::
See [ONNX supported models](#supported-models) for supported models, there are some caveats: See [ONNX supported models](#supported-models) for supported models, there are some caveats:
- D-FINE models are not supported - D-FINE models are not supported

View File

@ -78,6 +78,21 @@ class BaseModelRunner(ABC):
class ONNXModelRunner(BaseModelRunner): class ONNXModelRunner(BaseModelRunner):
"""Run ONNX models using ONNX Runtime.""" """Run ONNX models using ONNX Runtime."""
@staticmethod
def is_migraphx_complex_model(model_type: str) -> bool:
    """Report whether ``model_type`` is on the MIGraphX "complex" model list.

    Args:
        model_type: Model type value to look up. It is compared against the
            ``.value`` of each enum member listed below.

    Returns:
        True when ``model_type`` equals one of the listed enrichment or
        detector model type values, False otherwise.
    """
    # Deferred to call time: importing these at module level is circular.
    from frigate.detectors.detector_config import ModelTypeEnum
    from frigate.embeddings.types import EnrichmentModelTypeEnum

    complex_model_types = (
        EnrichmentModelTypeEnum.paddleocr.value,
        EnrichmentModelTypeEnum.jina_v1.value,
        EnrichmentModelTypeEnum.jina_v2.value,
        EnrichmentModelTypeEnum.facenet.value,
        ModelTypeEnum.rfdetr.value,
        ModelTypeEnum.dfine.value,
    )
    return model_type in complex_model_types
def __init__(self, ort: ort.InferenceSession): def __init__(self, ort: ort.InferenceSession):
self.ort = ort self.ort = ort
@ -441,6 +456,15 @@ def get_optimized_runner(
options[0]["device_id"], options[0]["device_id"],
) )
if (
providers
and providers[0] == "MIGraphXExecutionProvider"
and ONNXModelRunner.is_migraphx_complex_model(model_type)
):
# Don't use MIGraphX for models that are not supported
providers.pop(0)
options.pop(0)
return ONNXModelRunner( return ONNXModelRunner(
ort.InferenceSession( ort.InferenceSession(
model_path, model_path,

View File

@ -284,7 +284,9 @@ def post_process_yolox(
def get_ort_providers( def get_ort_providers(
force_cpu: bool = False, device: str | None = "AUTO", requires_fp16: bool = False force_cpu: bool = False,
device: str | None = "AUTO",
requires_fp16: bool = False,
) -> tuple[list[str], list[dict[str, Any]]]: ) -> tuple[list[str], list[dict[str, Any]]]:
if force_cpu: if force_cpu:
return ( return (
@ -351,12 +353,15 @@ def get_ort_providers(
} }
) )
elif provider == "MIGraphXExecutionProvider": elif provider == "MIGraphXExecutionProvider":
# MIGraphX uses more CPU than ROCM, while also being the same speed migraphx_cache_dir = os.path.join(MODEL_CACHE_DIR, "migraphx")
if device == "MIGraphX": os.makedirs(migraphx_cache_dir, exist_ok=True)
providers.append(provider) providers.append(provider)
options.append({}) options.append(
else: {
continue "migraphx_model_cache_dir": migraphx_cache_dir,
}
)
elif provider == "CPUExecutionProvider": elif provider == "CPUExecutionProvider":
providers.append(provider) providers.append(provider)
options.append( options.append(