This commit is contained in:
WhiteWolf84 2025-02-03 22:44:02 +01:00
parent 2c3d0a980e
commit 931b31452a
336 changed files with 43217 additions and 0 deletions

View File

@ -0,0 +1,105 @@
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
include(CheckCXXCompilerFlag)
# CPU backend for MIGraphX, implemented on top of oneDNN (or optionally ZenDNN).
add_library(migraphx_cpu
    allocate.cpp
    allocation_model.cpp
    binary.cpp
    concat.cpp
    convolution.cpp
    copy.cpp
    deconvolution.cpp
    dnnl.cpp
    eltwise.cpp
    erf.cpp
    fmod.cpp
    fuse_ops.cpp
    gather.cpp
    gemm.cpp
    layernorm.cpp
    logsoftmax.cpp
    lowering.cpp
    lrn.cpp
    mod.cpp
    preallocate.cpp
    pooling.cpp
    reduction.cpp
    reorder.cpp
    softmax.cpp
    sub.cpp
    target.cpp
    write_literals.cpp
)
set_target_properties(migraphx_cpu PROPERTIES EXPORT_NAME cpu)
rocm_set_soversion(migraphx_cpu ${MIGRAPHX_SO_VERSION})
# Opt-in switch: build the CPU backend against AMD ZenDNN instead of oneDNN.
set(MIGRAPHX_ENABLE_ZENDNN Off CACHE BOOL "Build the CPU backend against ZenDNN instead of oneDNN")
if(MIGRAPHX_ENABLE_ZENDNN)
    find_path(ZENDNN_INC_PATH zendnn.hpp)
    find_library(ZENDNN_LIB amdZenDNN)
    find_library(BLIS_LIB blis)
else()
    find_package(dnnl REQUIRED)
endif()
rocm_clang_tidy_check(migraphx_cpu)
if(MIGRAPHX_ENABLE_ZENDNN)
    # Note: no -D prefix here; CMake adds the correct flag for the compiler.
    target_compile_definitions(migraphx_cpu PRIVATE MIGRAPHX_ENABLE_ZENDNN)
    target_include_directories(migraphx_cpu PRIVATE ${ZENDNN_INC_PATH})
    message(STATUS "ZENDNN_LIB: ${ZENDNN_LIB}")
    target_link_libraries(migraphx_cpu PRIVATE ${BLIS_LIB})
    target_link_libraries(migraphx_cpu PRIVATE ${ZENDNN_LIB})
else()
    target_link_libraries(migraphx_cpu PUBLIC DNNL::dnnl)
endif()
target_link_libraries(migraphx_cpu PRIVATE migraphx)
migraphx_generate_export_header(migraphx_cpu)
find_package(OpenMP)
if(WIN32)
    # FindOpenMP does not model LLVM's libomp well on Windows; wire it up manually.
    target_link_libraries(migraphx_cpu PUBLIC libomp)
    target_include_directories(migraphx_cpu PUBLIC ${OpenMP_CXX_INCLUDE_DIRS})
    target_compile_options(migraphx_cpu PUBLIC ${OpenMP_CXX_FLAGS})
else()
    target_link_libraries(migraphx_cpu PUBLIC OpenMP::OpenMP_CXX)
    # Add library path to rpath to workaround issues with our broken packages
    foreach(LIBRARY ${OpenMP_CXX_LIBRARIES})
        if(LIBRARY MATCHES "libomp")
            get_filename_component(LIBRARY_PATH "${LIBRARY}" DIRECTORY)
            target_link_libraries(migraphx_cpu PUBLIC -Wl,-rpath=${LIBRARY_PATH} -Wl,-rpath-link=${LIBRARY_PATH})
        endif()
    endforeach()
endif()
rocm_install_targets(
    PRIVATE
    TARGETS migraphx_cpu
    INCLUDE
    ${CMAKE_CURRENT_SOURCE_DIR}/include
)

View File

@ -0,0 +1,60 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct cpu_allocate : auto_register_op<cpu_allocate>
{
    // Shape of the buffer this operator allocates.
    shape s;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.s, "shape"));
    }

    std::string name() const { return "cpu::allocate"; }

    // Takes no inputs; the output shape is the stored shape.
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        check_shapes{inputs, *this}.has(0);
        return s;
    }

    // Allocate an uninitialized buffer of the requested shape.
    argument compute(context&, const shape& output_shape, const std::vector<argument>&) const
    {
        return argument{output_shape};
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,46 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/cpu/allocation_model.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Name of the allocation operator used by the cpu target.
std::string cpu_allocation_model::name() const { return "cpu::allocate"; }
// Build an allocation operation producing a buffer of shape `s`.
operation cpu_allocation_model::allocate(const shape& s) const
{
    return make_op(name(), {{"shape", to_value(s)}});
}
// Build a preallocation operation for shape `s`, tagged with identifier `id`.
operation cpu_allocation_model::preallocate(const shape& s, const std::string& id) const
{
    return make_op("cpu::preallocate", {{"shape", to_value(s)}, {"id", id}});
}
// Name of the copy operator used by the cpu target.
std::string cpu_allocation_model::copy() const { return "cpu::copy"; }
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,83 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_binary : dnnl_op<dnnl_binary, dnnl::binary>
{
    // DNNL binary algorithm name (e.g. "binary_add").
    std::string algo;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack_join(self.reflect_base(self, f), pack(f(self.algo, "algo")));
    }

    std::string group() const { return this->name() + "::" + algo; }
    std::string name() const { return "dnnl::binary"; }

    // Compute the result shape of the binary op; prefers a packed,
    // non-broadcasted layout when the two inputs disagree.
    shape compute_shape(std::vector<shape> inputs) const
    {
        // Compensate for allocation
        inputs.pop_back();
        check_shapes{this->trim_post_op_inputs(inputs), *this}.has(2);
        const auto& s0 = inputs.at(0);
        const auto& s1 = inputs.at(1);
        shape r = s0;
        if(s0 != s1 or not s0.packed())
        {
            if(s0.packed() != s1.packed())
                r = s0.packed() ? s0 : s1;
            else if(s0.broadcasted() != s1.broadcasted())
                r = (s0.broadcasted() ? s1 : s0).with_lens(s0.lens());
            else
                r = shape{s0.type(), s0.lens()};
        }
        // Call to get_primitive to make sure an algo is available
        this->get_primitive(this->to_memory_desc(r, inputs));
        return r;
    }

    dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        return {to_dnnl_algo(algo),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_1)),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST))};
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,67 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/concat.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_concat : dnnl_extend_op<dnnl_concat, dnnl::concat, op::concat>
{
    // Concat takes a variable number of inputs, mapped to consecutive
    // DNNL multi-source argument slots.
    std::vector<int> arg_map(int size) const
    {
        std::vector<int> result(size);
        std::iota(result.begin(), result.end(), MIGRAPHX_DNNL_PREFIX(ARG_MULTIPLE_SRC));
        return result;
    }
    // Custom desc class since it's missing in dnnl
    struct desc
    {
        dnnl::memory::desc dst;
        std::size_t axis = 1;
        std::vector<dnnl::memory::desc> srcs;
    };
    // Build the desc from the memory-desc map: every entry except the
    // destination is a source.
    desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        std::vector<dnnl::memory::desc> srcs;
        srcs.reserve(m.size() - 1);
        // Use an unsigned index to avoid the signed/unsigned comparison
        // against m.size() that `auto i = 0` produced.
        for(std::size_t i = 0; i < m.size() - 1; i++)
        {
            srcs.push_back(m.at(MIGRAPHX_DNNL_PREFIX(ARG_MULTIPLE_SRC) + static_cast<int>(i)));
        }
        return {m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)), std::size_t(op.axis), srcs};
    }
    auto get_primitive_desc(const desc& d, const dnnl::primitive_attr& attr) const
    {
        return dnnl::concat::primitive_desc(d.dst, d.axis, d.srcs, get_dnnl_context().engine, attr);
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,86 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/quant_convolution.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_convolution
    : dnnl_extend_op<dnnl_convolution, dnnl::convolution_forward, op::convolution>
{
    // Convolution takes two inputs: the data tensor and the weights.
    std::vector<int> arg_map(int) const
    {
        return {MIGRAPHX_DNNL_PREFIX(ARG_SRC), MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)};
    }
    // Adjust the migraphx shape to what dnnl expects; for grouped convolution
    // the weights (i == 1) gain a leading group dimension.
    shape adjust_shape(const shape& x, int i, const shape& output) const
    {
        auto s = base_adjust_shape(x, output);
        if(i == 1 and op.group > 1)
        {
            // TODO: Add support for transposed weights
            if(not s.standard())
                MIGRAPHX_THROW("Weights for grouped convolution must be standard");
            auto lens = s.lens();
            // Prepend the group count and split the leading weight dimension
            // across the groups
            lens.insert(lens.begin(), op.group);
            lens.at(1) /= op.group;
            return shape{s.type(), lens};
        }
        return s;
    }
    dnnl::convolution_forward::desc
    get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        // In DNNL dilation is zero-based
        auto dilation = op.dilation;
        std::transform(
            dilation.begin(), dilation.end(), dilation.begin(), [](auto x) { return x - 1; });
        auto kdims = op.kdims();
        // op.padding stores begin-padding followed by end-padding; split it
        // into the two per-side vectors dnnl wants
        std::vector<size_t> padding_l(op.padding.begin(), op.padding.begin() + kdims);
        std::vector<size_t> padding_r(op.padding.begin() + kdims, op.padding.end());
        return {dnnl::prop_kind::forward_inference,
                dnnl::algorithm::convolution_auto,
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)),
                to_dnnl_dims(op.stride),
                to_dnnl_dims(dilation),
                to_dnnl_dims(padding_l),
                to_dnnl_dims(padding_r)};
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,65 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct cpu_copy : reduce_dims_base, auto_register_op<cpu_copy>
{
    template <class Self, class F>
    static auto reflect(Self&, F)
    {
        return pack();
    }
    std::string name() const { return "cpu::copy"; }
    // Two inputs: source and destination; the result takes the destination shape.
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        check_shapes{inputs, *this}.has(2);
        return inputs.at(1);
    }
    // Elementwise copy of the first argument into the last (the destination
    // buffer), then reshape the view back to the declared output shape.
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
    {
        argument result = get_arg(args, args.size() - 1);
        visit_all(result, get_arg(args, 0))([&](auto output, auto input) {
            pointwise(output, input)(ctx, output.get_shape(), 1024, [](auto& y, auto x) { y = x; });
        });
        return result.reshape(output_shape);
    }
    // The output aliases the last input (the destination buffer).
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,76 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/convolution_backwards.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_deconvolution
    : dnnl_extend_op<dnnl_deconvolution, dnnl::deconvolution_forward, op::convolution_backwards>
{
    // Deconvolution takes two inputs: the data tensor and the weights.
    std::vector<int> arg_map(int) const
    {
        return {MIGRAPHX_DNNL_PREFIX(ARG_SRC), MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)};
    }
    // Adjust the weights shape (i == 1) to dnnl's layout expectations.
    shape adjust_shape(const shape& x, int i, const shape& output) const
    {
        auto s = base_adjust_shape(x, output);
        if(i == 1)
        {
            // The input and output channels are flipped for dnnl
            auto lens = s.lens();
            std::swap(lens[0], lens[1]);
            auto strides = s.strides();
            std::swap(strides[0], strides[1]);
            return {s.type(), lens, strides};
        }
        return s;
    }
    dnnl::deconvolution_forward::desc
    get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        // In DNNL dilation is zero-based
        auto dilation = op.dilation;
        std::transform(
            dilation.begin(), dilation.end(), dilation.begin(), [](auto x) { return x - 1; });
        // Padding is applied symmetrically: the same values on both sides
        return {dnnl::prop_kind::forward_inference,
                dnnl::algorithm::deconvolution_direct,
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)),
                to_dnnl_dims(op.stride),
                to_dnnl_dims(dilation),
                to_dnnl_dims(op.padding),
                to_dnnl_dims(op.padding)};
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,205 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/cpu/dnnl.hpp>
#if defined(__GNUC__) && __GNUC__ <= 5
// Workaround for old gcc: provide std::hash for the dnnl::algorithm enum,
// needed to key the unordered_maps below (presumably gcc <= 5 lacks the
// standard enum-hash support — confirm before removing).
namespace std {
#ifdef MIGRAPHX_ENABLE_ZENDNN
// When building against ZenDNN, dnnl is an alias for the zendnn namespace
namespace dnnl = zendnn;
#endif
template <>
struct hash<dnnl::algorithm>
{
    using argument_type = dnnl::algorithm;
    using result_type   = std::size_t;
    // Hash the enum via its underlying integer type
    result_type operator()(const argument_type& x) const noexcept
    {
        return std::hash<underlying_type_t<argument_type>>{}(
            static_cast<underlying_type_t<argument_type>>(x));
    }
};
} // namespace std
#endif
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Process-wide dnnl context (holds the engine); lazily constructed on first use.
dnnl_context& get_dnnl_context()
{
    static dnnl_context ctx{}; // NOLINT
    return ctx;
}
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wswitch-enum"
#endif
// Map a migraphx shape type to the corresponding dnnl memory data type.
// Types dnnl cannot represent throw.
dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t)
{
    using dt = dnnl::memory::data_type;
    using st = shape::type_t;
    switch(t)
    {
    case st::half_type: return dt::f16;
    case st::float_type: return dt::f32;
    case st::int32_type: return dt::s32;
    case st::int8_type: return dt::s8;
    case st::uint8_type: return dt::u8;
    case st::fp8e4m3fnuz_type: MIGRAPHX_THROW("fp8e4m3fnuz unsupported in DNNL");
    default: MIGRAPHX_THROW("Unsupported data type");
    }
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif
// Map a tensor rank (1-6) to dnnl's generic format tag for that rank;
// other ranks throw.
dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n)
{
    switch(n)
    {
    case 1: return dnnl::memory::format_tag::a;
    case 2: return dnnl::memory::format_tag::ab;
    case 3: return dnnl::memory::format_tag::abc;
    case 4: return dnnl::memory::format_tag::abcd;
    case 5: return dnnl::memory::format_tag::abcde;
    case 6: return dnnl::memory::format_tag::abcdef;
    default: MIGRAPHX_THROW("Unsupported tensor size: " + std::to_string(n));
    }
}
// Build a dnnl memory descriptor with the same dims/type/strides as the shape.
dnnl::memory::desc to_dnnl_memory_desc(const shape& s)
{
    return {to_dnnl_dims(s.lens()), to_dnnl_memory_data_type(s.type()), to_dnnl_dims(s.strides())};
}
// Wrap an argument's existing buffer in a dnnl memory object (no copy).
dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a)
{
    return {desc, get_dnnl_context().engine, a.data()};
}
// Convenience overload deriving the descriptor from the argument's shape.
dnnl::memory to_dnnl_memory(const argument& a)
{
    return to_dnnl_memory(to_dnnl_memory_desc(a.get_shape()), a);
}
// clang-format off
// X-macro listing every dnnl algorithm name; used below to generate the
// string<->enum conversion tables.
#define MIGRAPHX_VISIT_DNNL_ALGO(m) \
    m(undef) \
    m(convolution_auto) \
    m(convolution_direct) \
    m(convolution_winograd) \
    m(deconvolution_direct) \
    m(deconvolution_winograd) \
    m(eltwise_relu) \
    m(eltwise_tanh) \
    m(eltwise_elu) \
    m(eltwise_square) \
    m(eltwise_abs) \
    m(eltwise_sqrt) \
    m(eltwise_swish) \
    m(eltwise_linear) \
    m(eltwise_bounded_relu) \
    m(eltwise_soft_relu) \
    m(eltwise_logistic) \
    m(eltwise_exp) \
    m(eltwise_gelu) \
    m(eltwise_gelu_tanh) \
    m(eltwise_gelu_erf) \
    m(eltwise_log) \
    m(eltwise_clip) \
    m(eltwise_pow) \
    m(eltwise_round) \
    m(eltwise_relu_use_dst_for_bwd) \
    m(eltwise_tanh_use_dst_for_bwd) \
    m(eltwise_elu_use_dst_for_bwd) \
    m(eltwise_sqrt_use_dst_for_bwd) \
    m(eltwise_logistic_use_dst_for_bwd) \
    m(eltwise_exp_use_dst_for_bwd) \
    m(lrn_across_channels) \
    m(lrn_within_channel) \
    m(pooling_max) \
    m(pooling_avg) \
    m(pooling_avg_include_padding) \
    m(pooling_avg_exclude_padding) \
    m(vanilla_rnn) \
    m(vanilla_lstm) \
    m(vanilla_gru) \
    m(lbr_gru) \
    m(binary_add) \
    m(binary_mul) \
    m(binary_max) \
    m(binary_min) \
    m(binary_div) \
    m(resampling_nearest) \
    m(resampling_linear) \
    m(reduction_max) \
    m(reduction_min) \
    m(reduction_sum) \
    m(reduction_mul) \
    m(reduction_mean) \
    m(reduction_norm_lp_max) \
    m(reduction_norm_lp_sum) \
    m(reduction_norm_lp_power_p_max) \
    m(reduction_norm_lp_power_p_sum)
// clang-format on
// Lazily-built table mapping algorithm names to dnnl::algorithm values.
const std::unordered_map<std::string, dnnl::algorithm>& dnnl_algo_map()
{
    static const std::unordered_map<std::string, dnnl::algorithm> m = {
// Each X-macro entry expands to {"name", dnnl::algorithm::name},
#define MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR(x) {#x, dnnl::algorithm::x},
        MIGRAPHX_VISIT_DNNL_ALGO(MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR)
#undef MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR
    };
    return m;
}
// Convert an algorithm name (e.g. "binary_add") to the dnnl enum value.
// Throws if the name is not a known dnnl algorithm.
dnnl::algorithm to_dnnl_algo(const std::string& name)
{
    const auto& algos = dnnl_algo_map();
    // Single find() instead of the previous count()+at() double lookup
    auto it = algos.find(name);
    if(it == algos.end())
        MIGRAPHX_THROW("Missing dnnl algo: " + name);
    return it->second;
}
// Lazily-built inverse table mapping dnnl::algorithm values to their names.
const std::unordered_map<dnnl::algorithm, std::string>& dnnl_algo_string_map()
{
    static const std::unordered_map<dnnl::algorithm, std::string> m = {
// Each X-macro entry expands to {dnnl::algorithm::name, "name"},
#define MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR(x) {dnnl::algorithm::x, #x},
        MIGRAPHX_VISIT_DNNL_ALGO(MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR)
#undef MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR
    };
    return m;
}
// Convert a dnnl algorithm enum to its string name; unknown values are
// rendered as "unknown_<int>" rather than throwing.
std::string to_string(const dnnl::algorithm& algo)
{
    const auto& names = dnnl_algo_string_map();
    // Single find() instead of the previous count()+at() double lookup
    auto it = names.find(algo);
    if(it == names.end())
        return "unknown_" + std::to_string(static_cast<int>(algo));
    return it->second;
}
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,73 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_eltwise : dnnl_op<dnnl_eltwise, dnnl::eltwise_forward>
{
    // DNNL eltwise algorithm name plus its two scalar parameters.
    std::string algo;
    float alpha = 0;
    float beta  = 0;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack_join(self.reflect_base(self, f),
                         pack(f(self.algo, "algo"), f(self.alpha, "alpha"), f(self.beta, "beta")));
    }

    std::string group() const { return this->name() + "::" + algo; }
    std::string name() const { return "dnnl::eltwise"; }

    shape compute_shape(std::vector<shape> inputs) const
    {
        // Compensate for allocation
        inputs.pop_back();
        check_shapes{this->trim_post_op_inputs(inputs), *this}.has(1).packed();
        const auto& s = inputs.at(0);
        // Normalize to a default-strided shape when the input is not packed
        auto r = s.packed() ? s : shape{s.type(), s.lens()};
        // Call to get_primitive to make sure an algo is available
        this->get_primitive(this->to_memory_desc(r, inputs));
        return r;
    }

    dnnl::eltwise_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        return {dnnl::prop_kind::forward_inference,
                to_dnnl_algo(algo),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)),
                alpha,
                beta};
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,36 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/erf.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Explicit instantiation registering the cpu elementwise erf operator.
template struct cpu_unary<op::erf>;
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,36 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/fmod.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Explicit instantiation registering the cpu elementwise fmod operator.
template struct cpu_binary<op::fmod>;
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,134 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/cpu/fuse_ops.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/value.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/context.hpp>
#include <migraphx/env.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/dead_code_elimination.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// When enabled, find_post_ops::matcher uses the broader matching pattern
// instead of the conservative workaround pattern (see matcher() below).
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_DNNL_POST_OPS_WORKAROUND);
// Matches an instruction whose operator value exposes a "post_ops" field,
// i.e. a DNNL op capable of carrying fused post-operations.
MIGRAPHX_PRED_MATCHER(has_post_ops, instruction_ref ins)
{
    auto v = ins->get_operator().to_value();
    return v.contains("post_ops");
}
// Matches an instruction that supports post-ops but has none attached yet.
MIGRAPHX_PRED_MATCHER(without_post_ops, instruction_ref ins)
{
    auto v = ins->get_operator().to_value();
    return v.contains("post_ops") and v["post_ops"].empty();
}
// Returns true when fusing post_op into op should be skipped to work
// around known DNNL post-op limitations; false when fusion may proceed.
bool workaround_dnnl_broken_post_ops(const operation& op, const operation& post_op)
{
    // Post-op fusion into dot/convolution is disabled entirely.
    if(contains({"dnnl::dot", "dnnl::convolution"}, op.name()))
        return true;
    auto pv = post_op.to_value();
    // Only fuse a post-op that has no post-ops of its own.
    if(not pv.at("post_ops").empty())
        return true;
    auto v = op.to_value();
    // Algorithm of the last post-op already attached to op, or the op's
    // own name when it has none.
    auto last_op = v.at("post_ops").empty() ? v : v.at("post_ops").back();
    auto algo = last_op.contains("algo") ? last_op.at("algo").to<std::string>() : op.name();
    auto post_algo = pv["algo"].to<std::string>();
    // NOTE(review): consecutive eltwise post-ops and repeated identical
    // algorithms are rejected — presumably unsupported by the DNNL
    // version in use; confirm against the oneDNN post-ops docs.
    if(starts_with(algo, "eltwise") and starts_with(post_algo, "eltwise"))
        return true;
    if(algo == post_algo)
        return true;
    return false;
}
// Fold post_op into op's "post_ops" list, producing a new fused
// operation. The post-op itself is appended first, followed by any
// post-ops it had already accumulated. Neither input is modified.
operation merge_post_ops(const operation& op, const operation& post_op)
{
    auto base_value    = op.to_value();
    auto post_op_value = post_op.to_value();
    base_value["post_ops"].push_back({{"algo", post_op_value["algo"]},
                                      {"alpha", post_op_value["alpha"].value_or(0.0f)},
                                      {"beta", post_op_value["beta"].value_or(0.0f)}});
    auto nested_post_ops = post_op_value.at("post_ops");
    for(const auto& nested : nested_post_ops)
        base_value["post_ops"].push_back(nested);
    return make_op(op.name(), base_value);
}
// Matcher pass that folds a dnnl::eltwise/dnnl::binary instruction into
// the "post_ops" list of the DNNL instruction producing its first input.
struct find_post_ops
{
    // Target context; borrowed, used to compile the fused candidate.
    context* ctx = nullptr;
    // With the workaround disabled via the environment variable, match
    // any eltwise/binary whose first input carries a post_ops field and
    // is used once. Otherwise match only the conservative pattern:
    // eltwise (no post-ops) fed by a binary (no post-ops, used once).
    match::any_matcher matcher() const
    {
        if(enabled(MIGRAPHX_DISABLE_DNNL_POST_OPS_WORKAROUND{}))
            return match::name("dnnl::eltwise",
                               "dnnl::binary")(match::arg(0)(has_post_ops(), match::used_once()));
        else
        {
            auto dnnl_binary = match::name("dnnl::binary")(without_post_ops(), match::used_once());
            return match::name("dnnl::eltwise")(without_post_ops(), match::arg(0)(dnnl_binary));
        }
    }
    // Replace the matched instruction with its producer extended by the
    // merged post-op, provided the fused op still computes the same
    // output shape and does not fall back to a reference implementation.
    void apply(module& m, const match::matcher_result& r) const
    {
        auto ins = r.result;
        auto x_ins = ins->inputs().front();
        auto x = x_ins->get_operator();
        if(workaround_dnnl_broken_post_ops(x, ins->get_operator()))
            return;
        auto op = merge_post_ops(x, ins->get_operator());
        auto inputs = x_ins->inputs();
        // The last input is the output allocation (see dnnl_op::output_alias);
        // switch it to the fused instruction's allocation.
        inputs.back() = ins->inputs().back();
        // A binary post-op consumes one extra tensor input.
        if(ins->name() == "dnnl::binary")
            inputs.insert(std::prev(inputs.end()), ins->inputs().at(1));
        auto input_shapes = to_shapes(inputs);
        auto new_shape = try_compute_shape(op, input_shapes);
        if(new_shape.empty() or new_shape.front() != ins->get_shape())
            return;
        auto info = compile(op, *ctx, new_shape.front(), input_shapes);
        // Reject fusions that DNNL would execute with a slow "ref:" impl.
        if(info.contains("impl") and starts_with(info.at("impl").to<std::string>(), "ref:"))
            return;
        m.replace_instruction(ins, op, inputs);
    }
};
// Repeatedly fuse eltwise/binary instructions into preceding DNNL
// operations as post-ops, cleaning up dead instructions after each round.
void fuse_ops::apply(module& m) const
{
    // Chains of fusable post-ops need multiple rounds to collapse fully;
    // a fixed iteration cap bounds the pass's runtime.
    const std::size_t max_fusion_iterations = 4;
    for(std::size_t i = 0; i < max_fusion_iterations; i++)
    {
        match::find_matches(m, find_post_ops{ctx});
        dead_code_elimination{}.apply(m);
    }
}
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,88 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/op/gather.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// CPU implementation of the gather operator. The last input is the
// preallocated output buffer (see output_alias).
struct cpu_gather : auto_register_op<cpu_gather>
{
    // Reference operator carrying the gather axis.
    op::gather op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    std::string name() const { return "cpu::" + op.name(); }
    shape compute_shape(std::vector<shape> inputs) const
    {
        // Compensate for allocation
        inputs.pop_back();
        check_shapes(inputs, *this).standard();
        return migraphx::compute_shape(op, inputs);
    }
    // Gather elements of args[0] along op.axis at the positions given by
    // args[1], writing into args.back().
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
    {
        std::size_t nelements = output_shape.elements();
        auto lens = args[0].get_shape().lens();
        auto axis_dim_size = lens[op.axis];
        // Shape used to map each linear output element to a multi-index:
        // the input's lens with the gather axis resized to the number of
        // indices.
        lens[op.axis] = args[1].get_shape().elements();
        shape out_comp{output_shape.type(), lens};
        visit_all(args.back(), args[0])([&](auto output, auto input) {
            args[1].visit([&](auto indices) {
                const auto* indices_ptr = indices.data();
                auto* output_ptr = output.data();
                // Parallelize over output elements, minimum grain 1024.
                ctx.bulk_execute(nelements, 1024, [=](auto start, auto end) {
                    for(auto i = start; i < end; i++)
                    {
                        auto idx = out_comp.multi(i);
                        auto in_index = indices_ptr[idx[op.axis]];
                        // Negative indices count back from the end of the axis.
                        in_index = (in_index < 0) ? in_index + axis_dim_size : in_index;
                        idx[op.axis] = in_index;
                        output_ptr[i] = input(idx.begin(), idx.end());
                    }
                });
            });
        });
        return args.back();
    }
    // The output aliases the last input (the preallocated buffer).
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,62 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/quant_dot.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// DNNL-backed matrix multiply implementing the dot operator via dnnl::matmul.
struct dnnl_gemm : dnnl_extend_op<dnnl_gemm, dnnl::matmul, op::dot>
{
    // Map MIGraphX argument positions to DNNL matmul execution argument ids.
    std::vector<int> arg_map(int) const
    {
        return {MIGRAPHX_DNNL_PREFIX(ARG_SRC),
                MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS),
                MIGRAPHX_DNNL_PREFIX(ARG_BIAS)};
    }
    // Broadcasted inputs are rejected for this primitive.
    template <class T>
    void required(const check_shapes<T>& cs) const
    {
        cs.not_broadcasted();
    }
    // Build the matmul descriptor from the src/weights/dst memory descriptors.
    dnnl::matmul::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        return {m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST))};
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,49 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_ALLOCATION_MODEL_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_ALLOCATION_MODEL_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Describes how the CPU target allocates output buffers; consumed by
// generic memory-planning passes. Member definitions live out of line.
struct cpu_allocation_model
{
    // Name of this target's allocate operation.
    std::string name() const;
    // Name of this target's copy operation.
    std::string copy() const;
    // Create an operation allocating a buffer of shape s.
    operation allocate(const shape& s) const;
    // Create an operation for a preallocated buffer identified by id.
    operation preallocate(const shape& s, const std::string& id) const;
    // The CPU target does not pass outputs as extra parameters.
    bool needs_out_params() const { return false; }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,58 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP
#define MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/cpu/parallel.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/cpu/export.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Execution context for the CPU target.
struct context
{
    // No-op: nothing is queued asynchronously to wait on.
    void finish() const {}
    // Run f(start, end) over [0, n) in parallel, guaranteeing at least
    // min_grain elements per worker (see cpu::parallel_for).
    template <class F>
    void bulk_execute(std::size_t n, std::size_t min_grain, F f)
    {
        cpu::parallel_for(n, min_grain, f);
    }
    // Overload with a default minimum grain size of 256.
    template <class F>
    void bulk_execute(std::size_t n, F f)
    {
        this->bulk_execute(n, 256, f);
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,441 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_DNNL_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_DNNL_HPP
#include <migraphx/config.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/check_shapes.hpp>
#include <unordered_map>
#include <migraphx/errors.hpp>
#include <migraphx/assert.hpp>
#ifdef MIGRAPHX_ENABLE_ZENDNN
#include <zendnn.hpp>
#else
#include <dnnl.hpp>
#endif
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#ifdef MIGRAPHX_ENABLE_ZENDNN
namespace dnnl = zendnn;
#define MIGRAPHX_CONCAT_PREFIX(b) ZENDNN_##b // NOLINT
#else
#define MIGRAPHX_CONCAT_PREFIX(b) DNNL_##b // NOLINT
#endif
#define MIGRAPHX_DNNL_PREFIX(b) MIGRAPHX_CONCAT_PREFIX(b) // NOLINT
// Bundles the DNNL engine and its stream; shared via get_dnnl_context().
struct dnnl_context
{
    dnnl::engine engine;
    dnnl::stream stream;
    // CPU engine index 0 with a stream bound to it.
    dnnl_context() : engine(dnnl::engine::kind::cpu, 0), stream(engine) {}
};
// Accessor for the shared dnnl_context (defined out of line).
dnnl_context& get_dnnl_context();
// Conversions between MIGraphX types and their DNNL counterparts
// (defined out of line).
dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t);
dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n);
// Copy any range of dimension sizes into a dnnl::memory::dims container.
template <class R>
inline dnnl::memory::dims to_dnnl_dims(R&& r)
{
    return {r.begin(), r.end()};
}
dnnl::memory::desc to_dnnl_memory_desc(const shape& s);
dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a);
dnnl::memory to_dnnl_memory(const argument& a);
// Map between algorithm name strings and dnnl::algorithm values.
dnnl::algorithm to_dnnl_algo(const std::string& name);
std::string to_string(const dnnl::algorithm& algo);
// Reflectable description of a single fused post operation: the DNNL
// algorithm name plus its alpha/beta parameters (their meaning depends
// on the algorithm).
struct post_op : reflect_equality<post_op>, reflect_stream<post_op>
{
    std::string algo;
    float alpha = 0;
    float beta = 0;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.algo, "algo"), f(self.alpha, "alpha"), f(self.beta, "beta"));
    }
};
// Adapts a callable expecting only the argument vector to the
// (context&, args) signature stored in dnnl_op::execute; the context
// parameter is accepted and ignored.
template <class Callable>
struct execute_wrapper
{
    Callable f;
    argument operator()(context&, const std::vector<argument>& args) const
    {
        return f(args);
    }
};
// Deduce the callable type and wrap it in an execute_wrapper.
template <class Callable>
execute_wrapper<Callable> make_execute_wrapper(Callable f)
{
    return {std::move(f)};
}
// CRTP base for all DNNL-backed operators. Derived supplies name() and
// get_desc(), and may override arg_map()/adjust_shape()/required(). The
// last MIGraphX input is always the preallocated output buffer (see
// output_alias). Fused post-operations are carried in post_ops and
// translated into DNNL primitive attributes.
template <class Derived, class Primitive>
struct dnnl_op : auto_register_op<Derived>
{
    std::vector<post_op> post_ops;
    // Bound in finalize(); runs the prepared primitive on the arguments.
    std::function<argument(context& ctx, const std::vector<argument>& args)> execute;
    template <class Self, class F>
    static auto reflect_base(Self& self, F f)
    {
        return pack(f(self.post_ops, "post_ops"));
    }
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return reflect_base(self, f);
    }
    std::string group() const
    {
        const auto& self = static_cast<const Derived&>(*this);
        return self.name();
    }
    // Report the operator group including the fused post-op algorithms,
    // e.g. "dnnl::convolution<eltwise_relu>".
    value attributes() const
    {
        std::vector<std::string> names;
        std::transform(post_ops.begin(), post_ops.end(), std::back_inserter(names), [](auto&& op) {
            return op.algo;
        });
        const auto& self = static_cast<const Derived&>(*this);
        auto g = self.group();
        if(not names.empty())
            g += "<" + join_strings(names, ",") + ">";
        return {{"group", g}};
    }
    // Number of extra MIGraphX inputs consumed by post-ops: each binary
    // post-op takes one additional tensor argument.
    std::size_t get_extra_post_op_args() const
    {
        return std::count_if(post_ops.begin(), post_ops.end(), [](const auto& po) {
            return contains(po.algo, "binary");
        });
    }
    // DNNL argument id for the second source of the pos-th post-op.
    static std::size_t get_binary_post_op_arg(std::size_t pos)
    {
        return MIGRAPHX_DNNL_PREFIX(ARG_ATTR_MULTIPLE_POST_OP)(pos) | // NOLINT
               MIGRAPHX_DNNL_PREFIX(ARG_SRC_1); // NOLINT
    }
    static std::vector<shape> to_shapes(const std::vector<argument>& args)
    {
        std::vector<shape> shapes(args.size());
        std::transform(args.begin(), args.end(), shapes.begin(), [](const argument& a) {
            return a.get_shape();
        });
        return shapes;
    }
    // Query the implementation-info string of a built primitive; used to
    // detect reference fallbacks (names starting with "ref:").
    static std::string impl(const Primitive& prim)
    {
        auto desc = prim.get_primitive_desc();
        const char* str = nullptr;
#ifdef MIGRAPHX_ENABLE_ZENDNN
        zendnn_primitive_desc_query(
            desc, zendnn_query_impl_info_str, 0, reinterpret_cast<void*>(&str));
#else
        dnnl_primitive_desc_query(desc, dnnl_query_impl_info_str, 0, reinterpret_cast<void*>(&str));
#endif
        return str == nullptr ? "" : str;
    }
    // Map arg index to arg in dnnl
    std::vector<int> arg_map(int size) const
    {
        std::vector<int> result(size);
        std::iota(result.begin(), result.end(), MIGRAPHX_DNNL_PREFIX(ARG_SRC_0));
        return result;
    }
    // Rewrite a broadcasted shape into a dense one: axes with stride 0
    // get length 1, and the result is laid out with the output's
    // permutation.
    shape base_adjust_shape(const shape& s, const shape& output) const
    {
        if(s.broadcasted())
        {
            auto lens = s.lens();
            auto strides = s.strides();
            std::transform(strides.begin(),
                           strides.end(),
                           lens.begin(),
                           lens.begin(),
                           [](auto stride, auto len) -> std::size_t {
                               if(stride == 0)
                                   return 1;
                               else
                                   return len;
                           });
            // Use the permutation of the output
            return output.with_lens(s.type(), lens);
        }
        return s;
    }
    // Invoke f(post_op, dnnl_arg) for each post-op; dnnl_arg is the extra
    // binary-source argument id for binary post-ops, -1 otherwise.
    template <class F>
    void for_each_post_op(F f) const
    {
        int i = 0;
        for(auto&& op : post_ops)
        {
            if(contains(op.algo, "binary"))
            {
                f(op, get_binary_post_op_arg(i));
            }
            else
            {
                f(op, -1);
            }
            i++;
        }
    }
    shape adjust_shape(const shape& s, int, const shape& output) const
    {
        return base_adjust_shape(s, output);
    }
    // Full mapping from MIGraphX input position to DNNL argument id: the
    // primitive's own inputs followed by one entry per binary post-op.
    std::vector<int> create_arg_map(std::size_t input_size) const
    {
        const auto& self = static_cast<const Derived&>(*this);
        auto npost_ops = get_extra_post_op_args();
        auto prim_input_size = input_size - npost_ops;
        auto m = self.arg_map(prim_input_size);
        for_each_post_op([&](auto&&, auto arg) {
            if(arg < 0)
                return;
            m.push_back(arg);
        });
        return m;
    }
    // Build DNNL memory descriptors for the output (ARG_DST) and each input.
    std::unordered_map<int, dnnl::memory::desc>
    to_memory_desc(const shape& output_shape, const std::vector<shape>& inputs) const
    {
        const auto& self = static_cast<const Derived&>(*this);
        std::unordered_map<int, dnnl::memory::desc> result;
        result[MIGRAPHX_DNNL_PREFIX(ARG_DST)] =
            to_dnnl_memory_desc(self.adjust_shape(output_shape, inputs.size(), output_shape));
        auto m = create_arg_map(inputs.size());
        assert(m.size() >= inputs.size());
        for(int i = 0; i < inputs.size(); i++)
        {
            result[m[i]] = to_dnnl_memory_desc(self.adjust_shape(inputs[i], i, output_shape));
        }
        return result;
    }
    // Translate the post_ops list into DNNL primitive attributes. A
    // binary_add whose operand descriptor equals the destination's
    // becomes a sum post-op instead of a binary one.
    dnnl::primitive_attr
    get_primitive_attr(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        dnnl::primitive_attr result;
        dnnl::post_ops po;
        for_each_post_op([&](auto&& op, auto arg) {
            if(contains(op.algo, "binary_add"))
            {
                auto desc = m.at(arg);
                if(desc == m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)))
                    po.append_sum(1.0f);
                else
                    po.append_binary(to_dnnl_algo(op.algo), m.at(arg));
            }
            else if(contains(op.algo, "binary"))
            {
                po.append_binary(to_dnnl_algo(op.algo), m.at(arg));
            }
            else if(contains(op.algo, "eltwise"))
                po.append_eltwise(1.0f, to_dnnl_algo(op.algo), op.alpha, op.beta);
            else
                MIGRAPHX_THROW("Unknown post op algo: " + op.algo);
        });
        result.set_post_ops(po);
        return result;
    }
    template <class T>
    auto get_primitive_desc(const T& desc, const dnnl::primitive_attr& attr) const
        -> decltype(typename Primitive::primitive_desc(desc, attr, get_dnnl_context().engine))
    {
        return typename Primitive::primitive_desc(desc, attr, get_dnnl_context().engine);
    }
    // Construct the primitive from the derived op's descriptor and the
    // post-op attributes.
    Primitive get_primitive(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        const auto& self = static_cast<const Derived&>(*this);
        auto desc = self.get_desc(m);
        auto attr = MIGRAPHX_ASSERT_NO_THROW(this->get_primitive_attr(m));
        auto pd = self.get_primitive_desc(desc, attr);
        return Primitive(pd);
    }
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
    {
        return execute(ctx, args);
    }
    // The output aliases the last input (the preallocated buffer).
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
    // Report which DNNL implementation would run; the fusion pass uses
    // this to reject reference ("ref:") fallbacks.
    value compile(context&, const shape& output_shape, std::vector<shape> inputs)
    {
        // Compensate for allocation
        inputs.pop_back();
        auto md = to_memory_desc(output_shape, inputs);
        auto prim = get_primitive(md);
        auto impl_name = impl(prim);
        return {{"impl", impl_name}};
    }
    // Build the primitive once and capture it in the execute callback.
    // Debug builds additionally re-validate memory descriptors and
    // post-op arguments on every invocation.
    void finalize(context&, const shape& output_shape, std::vector<shape> inputs)
    {
        // Compensate for allocation
        inputs.pop_back();
        const auto& self = static_cast<const Derived&>(*this);
        auto name = self.name();
        auto md = to_memory_desc(output_shape, inputs);
        auto prim = get_primitive(md);
        auto arg_lookup = create_arg_map(inputs.size());
#ifndef NDEBUG
        auto prim_attr = get_primitive_attr(md);
#endif
        execute = make_execute_wrapper([=](const std::vector<argument>& args) {
#ifndef NDEBUG
            // Check that the memory descriptors have not changed
            auto debug_args = args;
            debug_args.pop_back();
            auto debug_md = to_memory_desc(output_shape, to_shapes(debug_args));
            for(auto&& p : debug_md)
            {
                if(md.count(p.first) == 0)
                    MIGRAPHX_THROW(name +
                                   ": Missing memory descriptor for: " + std::to_string(p.first));
                if(p.second == md.at(p.first))
                    continue;
                MIGRAPHX_THROW(name +
                               ": Memory descriptor has changed for: " + std::to_string(p.first));
            }
            // Check post_ops args are correct
            auto pos = prim_attr.get_post_ops();
            auto prim_input_size = inputs.size() - this->get_extra_post_op_args();
            int j = 0;
            for(int i = 0; i < pos.len(); i++)
            {
                auto arg = j + prim_input_size;
                auto kind = pos.kind(i);
                std::string mesg =
                    "Post op " + std::to_string(i) + "@" + std::to_string(arg) + ": ";
                try
                {
                    dnnl::algorithm algo;
                    dnnl::memory::desc mdesc;
                    float scale = 0;
                    float alpha = 0;
                    float beta = 0;
                    if(kind == dnnl::primitive::kind::binary)
                    {
                        pos.get_params_binary(i, algo, mdesc);
                        if(mdesc != md.at(arg_lookup.at(arg)))
                            MIGRAPHX_THROW(mesg +
                                           "Memory descriptor doesn't match for binary post op");
                        j++;
                    }
                    else if(kind == dnnl::primitive::kind::eltwise)
                    {
                        pos.get_params_eltwise(i, scale, algo, alpha, beta);
                    }
                    else if(kind == dnnl::primitive::kind::sum)
                    {
                        pos.get_params_sum(i, scale);
                        algo = dnnl::algorithm::binary_add;
                    }
                    else
                    {
                        MIGRAPHX_THROW("Unknown kind");
                    }
                    if(to_dnnl_algo(post_ops[i].algo) != algo)
                        MIGRAPHX_THROW(mesg + "Algorithm doesn't match for post op " +
                                       post_ops[i].algo + " != " + to_string(algo));
                }
                catch(const dnnl::error& e)
                {
                    MIGRAPHX_THROW(mesg + "Failed to get post ops argument " + ": " + e.what());
                }
            }
#endif
            // Bind the actual buffers to DNNL argument ids and run.
            std::unordered_map<int, dnnl::memory> m;
            m[MIGRAPHX_DNNL_PREFIX(ARG_DST)] =
                to_dnnl_memory(md.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)), args.back());
            for(int i = 0; i < args.size() - 1; i++)
                m[arg_lookup[i]] = to_dnnl_memory(md.at(arg_lookup[i]), args[i]);
            prim.execute(get_dnnl_context().stream, m);
            return args.back();
        });
    }
    // Drop the trailing inputs consumed by binary post-ops, leaving only
    // the primitive's own inputs.
    std::vector<shape> trim_post_op_inputs(const std::vector<shape>& inputs) const
    {
        auto prim_input_size = inputs.size() - this->get_extra_post_op_args();
        return {inputs.begin(), inputs.begin() + prim_input_size};
    }
};
// Convenience base combining dnnl_op with a reference MIGraphX operator
// Op, which supplies the operator name and shape computation.
template <class Derived, class Primitive, class Op>
struct dnnl_extend_op : dnnl_op<Derived, Primitive>
{
    Op op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack_join(self.reflect_base(self, f), migraphx::reflect(self.op, f));
    }
    // dnnl has some issues with non-packed inputs
    template <class T>
    void required(const check_shapes<T>& cs) const
    {
        cs.packed_or_broadcasted();
    }
    std::string name() const { return "dnnl::" + op.name(); }
    // Compute the output shape via the reference operator (excluding
    // post-op inputs), then eagerly build the primitive so unsupported
    // configurations are rejected at shape-computation time.
    shape compute_shape(std::vector<shape> inputs) const
    {
        const auto& self = static_cast<const Derived&>(*this);
        // Compensate for allocation
        inputs.pop_back();
        self.required(check_shapes(inputs, self));
        auto r = migraphx::compute_shape(op, this->trim_post_op_inputs(inputs));
        // Call to get_primitive to make sure an algo is available
        this->get_primitive(this->to_memory_desc(r, inputs));
        return r;
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,47 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_CPU_FUSE_OPS_HPP
#define MIGRAPHX_GUARD_CPU_FUSE_OPS_HPP
#include <migraphx/cpu/context.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace cpu {
// Compiler pass that fuses eltwise/binary instructions into preceding
// DNNL operations as post-ops; apply() is defined out of line.
struct MIGRAPHX_CPU_EXPORT fuse_ops
{
    // Target context; borrowed, not owned.
    context* ctx = nullptr;
    std::string name() const { return "cpu::fuse_ops"; }
    void apply(module& m) const;
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_CPU_FUSE_OPS_HPP

View File

@ -0,0 +1,46 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP
#define MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP
#include <migraphx/cpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace cpu {
// Compiler pass lowering generic instructions to the CPU target's
// implementations; apply() is defined out of line.
struct MIGRAPHX_CPU_EXPORT lowering
{
    std::string name() const { return "cpu::lowering"; }
    void apply(module& m) const;
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,125 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP
// #define MIGRAPHX_DISABLE_OMP
#include <cmath>
#include <cassert>
#include <migraphx/config.hpp>
#ifdef MIGRAPHX_DISABLE_OMP
#include <migraphx/par_for.hpp>
#else
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-identifier"
#endif
#include <omp.h>
#ifdef __clang__
#pragma clang diagnostic pop
#endif
#endif
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#ifdef MIGRAPHX_DISABLE_OMP
// Thread-based fallback used when OpenMP is disabled.
inline std::size_t max_threads() { return std::thread::hardware_concurrency(); }
// Split [0, n) into contiguous chunks of ceil(n/threadsize) and run
// f(start, end) on each from its own thread; a single-thread request
// runs f inline over the whole range.
template <class F>
void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
{
    if(threadsize <= 1)
    {
        f(std::size_t{0}, n);
    }
    else
    {
        std::vector<joinable_thread> threads(threadsize);
// Using const here causes gcc 5 to ICE
#if(!defined(__GNUC__) || __GNUC__ != 5)
        const
#endif
        std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
        // Each generated thread takes the next grainsize-sized chunk.
        std::size_t work = 0;
        std::generate(threads.begin(), threads.end(), [=, &work] {
            auto result = joinable_thread([=]() mutable {
                assert(work < n);
                f(work, std::min(n, work + grainsize));
            });
            work += grainsize;
            return result;
        });
        // cppcheck-suppress unsignedLessThanZero
        assert(work >= n);
    }
}
#else
// OpenMP-based implementation.
inline std::size_t max_threads() { return omp_get_max_threads(); }
// Split [0, n) into contiguous chunks of ceil(n/threadsize), one per
// OpenMP thread, and run f(start, end) on each; a single-thread request
// runs f inline over the whole range.
template <class F>
void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
{
    if(threadsize <= 1)
    {
        f(std::size_t{0}, n);
    }
    else
    {
        std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
#pragma omp parallel for num_threads(threadsize) schedule(static, 1)
        for(std::size_t tid = 0; tid < threadsize; tid++)
        {
            std::size_t work = tid * grainsize;
            assert(work < n);
            f(work, std::min(n, work + grainsize));
        }
    }
}
#endif
// Execute f(start, end) over chunks of [0, n) in parallel, guaranteeing
// at least min_grain elements per worker; ranges too small to split
// (fewer than min_grain per available thread) run serially.
template <class F>
void parallel_for(std::size_t n, std::size_t min_grain, F f)
{
    const auto nworkers = std::min<std::size_t>(max_threads(), n / min_grain);
    parallel_for_impl(n, nworkers, f);
}
// Overload with a default minimum grain size of 8 elements.
template <class F>
void parallel_for(std::size_t n, F f)
{
    parallel_for(n, std::size_t{8}, f);
}
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,414 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_POINTWISE_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_POINTWISE_HPP
#include <array>
#include <migraphx/config.hpp>
#include <migraphx/context.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Small fixed-capacity multi-dimensional index used to iterate non-contiguous
// tensors without repeated division/modulo. Tracks both the current index and
// the dimensions of the shape it walks.
struct multi_index
{
    constexpr multi_index() = default;
    // Build the multi-dimensional index corresponding to flat element i of shape s.
    multi_index(const shape& s, std::size_t i) : n(s.lens().size())
    {
        // The arrays hold up to max_size dimensions, so rank == max_size is
        // still valid; the original `assert(n < max_size)` was off by one and
        // rejected shapes of exactly max_size dimensions.
        assert(n <= max_size);
        std::copy(s.lens().begin(), s.lens().end(), dims);
        s.multi_copy(i, index, index + max_size);
    }
    constexpr std::size_t size() const { return n; }
    constexpr std::size_t* begin() { return index; }
    constexpr const std::size_t* begin() const { return index; }
    constexpr std::size_t* end() { return index + size(); }
    constexpr const std::size_t* end() const { return index + size(); }
    // Linear offset of the current index within shape s (which may have
    // different strides than the shape used to construct this index).
    std::size_t offset(const shape& s) const { return s.index(begin(), end()); }
    // Propagate carries from the innermost dimension outward after increment().
    constexpr void carry()
    {
        std::size_t overflow = 0;
        for(std::ptrdiff_t i = size() - 1; i > 0; i--)
        {
            auto z = index[i] + overflow;
            // Reset overflow
            overflow = 0;
            // Compute overflow using while loop instead of mod
            // overflow = z / dims[i];
            // z = z % dims[i];
            while(z >= dims[i])
            {
                z -= dims[i];
                overflow += 1;
            }
            index[i] = z;
            // Exit if there is no overflow
            if(overflow == 0)
                return;
        }
        // The outermost dimension absorbs any remaining overflow unchecked.
        index[0] += overflow;
    }
    constexpr void increment(std::size_t i)
    {
        index[size() - 1] += i;
        carry();
    }
    constexpr multi_index& operator+=(std::size_t i)
    {
        increment(i);
        return *this;
    }
    constexpr multi_index& operator++()
    {
        increment(1);
        return *this;
    }
    multi_index operator++(int) // NOLINT
    {
        multi_index result = *this;
        increment(1);
        return result;
    }

    private:
    static const std::size_t max_size = 5;
    std::size_t index[max_size] = {};
    std::size_t dims[max_size]  = {};
    std::size_t n               = 0;
};
// Mixin for ops that collapse mergeable dimensions before computing: caches
// the reduced shapes in finalize() and reshapes arguments on demand.
struct reduce_dims_base
{
    std::vector<shape> reduce_shapes;

    // Cache the dimension-reduced shapes for the given inputs.
    void finalize(context&, const shape&, const std::vector<shape>& inputs)
    {
        reduce_shapes = reduce_dims(inputs);
    }

    // Return argument i, reshaped to its reduced shape when one was computed.
    argument get_arg(const std::vector<argument>& args, std::size_t i) const
    {
        return reduce_shapes.empty() ? args[i] : args.at(i).reshape(reduce_shapes.at(i));
    }

    // Allocate an output argument with the first reduced shape.
    argument get_output() const { return argument{reduce_shapes[0]}; }
};
// SIMD-friendly wrapper: an N-element std::array of T overlaid in a union with
// a compiler vector-extension type of the same size, so the same storage can
// be accessed element-wise or as a whole vector.
template <class T, std::size_t N>
struct vec
{
    using array_type = std::array<T, N>;
    // GCC/Clang vector extension type with N lanes of T.
    using vector_type __attribute__((vector_size(N * sizeof(T)))) = T;
    union
    {
        array_type array;
        vector_type vector;
    };
    static_assert(sizeof(array_type) == sizeof(vector_type), "Not the same size");
};
// Fallback overload: any non-vec type reports a vector width of 0,
// i.e. it is treated as a scalar.
template <class T>
constexpr std::integral_constant<std::size_t, 0> vec_size(const T&)
{
    return std::integral_constant<std::size_t, 0>{};
}
// A vec<T, N> reports its lane count N at compile time.
template <class T, std::size_t N>
constexpr std::integral_constant<std::size_t, N> vec_size(const vec<T, N>&)
{
    return std::integral_constant<std::size_t, N>{};
}
// Compile-time lane count of T as a plain std::size_t value, resolved through
// the vec_size overload set.
template <class T>
constexpr std::size_t vec_size()
{
    using width = decltype(vec_size(std::declval<T>()));
    return width{};
}
// Vector overload: apply f lane by lane, writing through v's array view and
// reading the trailing operands through their vector views.
template <class F, class V, class... Vs, MIGRAPHX_REQUIRES((vec_size<V>() > 0))>
void vec_apply(F f, V& v, Vs... vs)
{
    // All operands must have the same lane count as the output.
    assert(all_of({vec_size<Vs>()...}, [&](auto n) { return n == vec_size<V>(); }));
    assert(vec_size<V>() == v.array.size());
    for(std::size_t i = 0; i < vec_size<V>(); i++)
        f(v.array[i], vs.vector[i]...);
}
// Scalar overload (lane count 0): forward the operands directly to f.
template <class F, class V, class... Vs, MIGRAPHX_REQUIRES((vec_size<V>() == 0))>
void vec_apply(F f, V& v, Vs&... vs)
{
    f(v, vs...);
}
// Index of the first axis that actually carries packed data: length > 1 with
// stride 1. Returns std::size_t(-1) when no such axis exists.
inline std::size_t find_packed_len(const shape& s)
{
    const auto& lens    = s.lens();
    const auto& strides = s.strides();
    std::size_t axis    = 0;
    while(axis < lens.size())
    {
        if(strides[axis] == 1 and lens[axis] > 1)
            return axis;
        ++axis;
    }
    return -1;
}
// Shrink the packed axis of s by a factor of N, producing the shape seen when
// iterating vec<T, N> elements instead of scalars. Requires the packed axis
// length to be a multiple of N (asserted).
template <std::size_t N>
shape vectorize(const shape& s)
{
    assert(s.standard() or s.broadcasted());
    auto lens = s.lens();
    if(s.broadcasted())
    {
        // For broadcast shapes only the single packed axis carries data;
        // shrink it and keep the original strides.
        auto n = find_packed_len(s);
        assert(n != -1);
        assert((lens[n] % N) == 0);
        lens[n] /= N;
        return {s.type(), lens, s.strides()};
    }
    // Standard shapes are vectorized along the (contiguous) last axis.
    assert((lens.back() % N) == 0);
    lens.back() /= N;
    return {s.type(), lens};
}
// Reinterpret a scalar tensor_view as a view of vec<T, N> elements with the
// correspondingly shrunken shape. The divisibility requirement is enforced by
// the asserts in the shape overload of vectorize<N>.
template <std::size_t N, class T>
tensor_view<vec<T, N>> vectorize(tensor_view<T> tv)
{
    return {vectorize<N>(tv.get_shape()), reinterpret_cast<vec<T, N>*>(tv.data())};
}
// Trait marking element types that may be processed through vec<T, N>.
// Only float is enabled here.
template <class T>
struct is_vector_type : std::false_type
{
};
template <>
struct is_vector_type<float> : std::true_type
{
};
// True when every tensor_view in Ts has a vectorizable value_type.
template <class... Ts>
struct is_vector_tensor_view : and_<is_vector_type<typename Ts::value_type>{}...>
{
};
// True when every shape can be processed N elements at a time: either it is
// standard with its last dimension divisible by N, or it is broadcast with a
// single packed data axis whose length is divisible by N.
template <std::size_t N, class... Xs>
bool is_vectorizable(const Xs&... xs)
{
    return all_of({xs...}, [](const auto& s) {
        if(s.standard() and (s.lens().back() % N) == 0)
            return true;
        if(s.broadcasted())
        {
            // Sum the strides of non-broadcast axes; the result is exactly 1
            // when a single packed axis carries all the data. Accumulate in
            // std::size_t: the original passed an int literal 0 as the initial
            // value, which made std::inner_product accumulate in int and could
            // narrow large std::size_t strides.
            auto n = std::inner_product(s.lens().begin(),
                                        s.lens().end(),
                                        s.strides().begin(),
                                        std::size_t{0},
                                        std::plus<>{},
                                        [&](auto len, auto stride) -> std::size_t {
                                            if(stride > 0 and len == 1)
                                                return 0;
                                            return stride;
                                        });
            if(n == 1)
            {
                auto i = find_packed_len(s);
                assert(i != -1);
                return (s.lens()[i] % N) == 0;
            }
        }
        return false;
    });
}
// For vectorizable element types: return a callable that invokes f with the
// widest viable vectorization — try 32 lanes, then 8, otherwise fall back to
// the original scalar shape and views.
template <class... Ts, MIGRAPHX_REQUIRES(is_vector_tensor_view<Ts...>{})>
auto auto_vectorize(const shape& base_shape, Ts... xs)
{
    return [=](auto f) {
        if(is_vectorizable<32>(base_shape, xs.get_shape()...))
            f(vectorize<32>(base_shape), vectorize<32>(xs)...);
        else if(is_vectorizable<8>(base_shape, xs.get_shape()...))
            f(vectorize<8>(base_shape), vectorize<8>(xs)...);
        else
            f(base_shape, xs...);
    };
}
// Non-vectorizable element types: always invoke f with the original views.
template <class... Ts, MIGRAPHX_REQUIRES(not is_vector_tensor_view<Ts...>{})>
auto auto_vectorize(const shape& base_shape, Ts... xs)
{
    return [=](auto f) { f(base_shape, xs...); };
}
// True when one flat offset can index every tensor identically: either all
// shapes are standard, or all are packed and equal to the first shape.
template <class X, class... Xs>
bool is_standard_offset(const X& x, const Xs&... xs)
{
    const bool all_standard = all_of({x, xs...}, [](const auto& s) { return s.standard(); });
    if(all_standard)
        return true;
    const bool all_packed = all_of({x, xs...}, [](const auto& s) { return s.packed(); });
    return all_packed and all_of({xs...}, [&](const auto& s) { return s == x; });
}
// Schedule f element-wise over the given tensor views. Returns a callable
// taking (ctx, base_shape, min_grain, f) so the op can pick the grain size.
template <class... Ts>
auto pointwise_apply(Ts... ts)
{
    return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
        if(is_standard_offset(ts.get_shape()...))
        {
            // Fast path: one flat offset indexes every tensor identically.
            ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable {
                for(auto i = start; i < end; i++)
                {
                    vec_apply(f, ts.data()[i]...);
                }
            });
        }
        else
        {
            // Slow path: walk a multi-dimensional index and compute each
            // tensor's offset from its own strides.
            // NOTE(review): this allows up to 6 dimensions, but multi_index
            // has a fixed capacity of 5 — confirm the intended limit.
            assert(base_shape.lens().size() <= 6);
            ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable {
                multi_index mi(base_shape, start);
                for(auto i = start; i < end; i++)
                {
                    vec_apply(f, ts.data()[mi.offset(ts.get_shape())]...);
                    ++mi;
                }
            });
        }
    };
}
// Compose auto-vectorization with pointwise_apply: choose a vector width for
// the views, then schedule f element-wise over the (possibly vectorized) data.
template <class... Ts>
auto pointwise(Ts... ts)
{
    return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
        auto_vectorize(base_shape, ts...)(
            [&](auto bs, auto... xs) { pointwise_apply(xs...)(ctx, bs, min_grain, f); });
    };
}
// Elementwise unary CPU op wrapping Op. Expects two arguments: the input and
// a preallocated output buffer (which is also the aliased result).
template <class Op>
struct cpu_unary : reduce_dims_base, auto_register_op<cpu_unary<Op>>
{
    Op op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    std::string name() const { return "cpu::" + op.name(); }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        // One input plus the output allocation.
        check_shapes{inputs, *this}.has(2);
        const auto& s = inputs.at(0);
        return {s.type(), s.lens()};
    }
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
    {
        // The last argument is the output buffer (see output_alias).
        argument result = get_arg(args, args.size() - 1);
        visit_all(result, get_arg(args, 0))([&](auto output, auto input) {
            // Copy the op so the lambda does not capture `this`.
            auto op2 = op;
            pointwise(output, input)(
                ctx, output.get_shape(), 1024, [op2](auto& y, auto x) { y = op2.apply()(x); });
        });
        return result.reshape(output_shape);
    }
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};
// Elementwise binary CPU op wrapping Op. Expects three arguments: two inputs
// and a preallocated output buffer (which is also the aliased result).
template <class Op>
struct cpu_binary : reduce_dims_base, auto_register_op<cpu_binary<Op>>
{
    Op op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    std::string name() const { return "cpu::" + op.name(); }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        // Two inputs plus the output allocation.
        check_shapes{inputs, *this}.has(3);
        const auto& s = inputs.at(0);
        return {s.type(), s.lens()};
    }
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
    {
        // The last argument is the output buffer (see output_alias).
        argument result = get_arg(args, args.size() - 1);
        visit_all(result, get_arg(args, 0), get_arg(args, 1))(
            [&](auto output, auto input1, auto input2) {
                // Copy the op so the lambda does not capture `this`.
                auto op2 = op;
                pointwise(output, input1, input2)(
                    ctx, output.get_shape(), 1024, [op2](auto& z, auto x, auto y) {
                        z = op2.apply()(x, y);
                    });
            });
        return result.reshape(output_shape);
    }
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,51 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_CPU_TARGET_HPP
#define MIGRAPHX_GUARD_MIGRAPHLIB_CPU_TARGET_HPP
#include <migraphx/program.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/compile_options.hpp>
#include <migraphx/cpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct pass;
namespace cpu {
// CPU compute target: supplies the lowering passes and execution context used
// to compile and run programs on the host.
struct MIGRAPHX_CPU_EXPORT target
{
    std::string name() const;
    // Compilation passes that lower a program for this target.
    std::vector<pass> get_passes(migraphx::context& gctx, const compile_options&) const;
    migraphx::context get_context() const { return context{}; }
    // Host memory is directly accessible, so target copies are the identity.
    argument copy_to(const argument& arg) const { return arg; }
    argument copy_from(const argument& arg) const { return arg; }
    argument allocate(const shape& s) const;
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,45 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_WRITE_LITERALS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_WRITE_LITERALS_HPP
#include <migraphx/config.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace cpu {
// Compiler pass over a module; implementation lives in write_literals.cpp.
// NOTE(review): the name suggests it rewrites literal instructions for the
// CPU target — confirm against the .cpp before relying on this description.
struct write_literals
{
    std::string name() const { return "cpu::write_literals"; }
    void apply(module& m) const;
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,65 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// oneDNN-backed layer normalization (inference only, no scale/shift).
struct dnnl_layernorm : dnnl_op<dnnl_layernorm, dnnl::layer_normalization_forward>
{
    // Numerical-stability constant added to the variance.
    float epsilon = 1e-12f;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.epsilon, "epsilon"));
    }
    std::string name() const { return "dnnl::layernorm"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
        // Compensate for allocation
        inputs.pop_back();
        check_shapes{this->trim_post_op_inputs(inputs), *this}.has(1);
        auto s = inputs.at(0);
        // Call to get_primitive to make sure an algo is available
        this->get_primitive(this->to_memory_desc(s, inputs));
        return s;
    }
    dnnl::layer_normalization_forward::desc
    get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        // Pass the reflected epsilon member instead of a hard-coded 1e-12f so
        // a configured epsilon value is actually honored by the primitive.
        return {dnnl::prop_kind::forward_inference,
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
                epsilon,
                dnnl::normalization_flags::none};
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,44 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/logsoftmax.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// oneDNN-backed logsoftmax: wires op::logsoftmax into a
// dnnl::logsoftmax_forward primitive.
struct dnnl_logsoftmax : dnnl_extend_op<dnnl_logsoftmax, dnnl::logsoftmax_forward, op::logsoftmax>
{
    dnnl::logsoftmax_forward::desc
    get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        // oneDNN expects the axis as a plain int.
        int axis = this->op.axis;
        return {dnnl::prop_kind::forward_inference, m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)), axis};
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,502 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/cpu/lowering.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/convolution_backwards.hpp>
#include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/op/elu.hpp>
#include <migraphx/op/im2col.hpp>
#include <migraphx/op/leaky_relu.hpp>
#include <migraphx/op/logsoftmax.hpp>
#include <migraphx/op/lrn.hpp>
#include <migraphx/op/pad.hpp>
#include <migraphx/op/pooling.hpp>
#include <migraphx/op/softmax.hpp>
#include <migraphx/op/argmax.hpp>
#include <migraphx/op/argmin.hpp>
#include <migraphx/op/rnn_var_sl_last_output.hpp>
#include <migraphx/op/mod.hpp>
#include <migraphx/op/fmod.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/par_dfor.hpp>
#include <migraphx/clamp.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/program.hpp>
#include <migraphx/tune_axis.hpp>
#include <migraphx/match/layernorm.hpp>
#include <migraphx/match/gelu_erf.hpp>
#include <migraphx/match/gelu_tanh.hpp>
#include <migraphx/matcher.hpp>
#include <unordered_map>
#include <utility>
#include <iostream>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Return the zero value of T; the argument only drives type deduction.
template <typename T>
T zero(const T&)
{
    return static_cast<T>(0);
}
// Convert x to its signed counterpart when T is an integral type; return
// non-integral types unchanged. The return type is selected at compile time
// (std::make_signed<T>::type for integrals, T otherwise) and the conversion,
// if any, happens implicitly on the return statement.
template <class T>
typename std::conditional_t<std::is_integral<T>{}, std::make_signed<T>, std::enable_if<true, T>>::
    type
    make_signed(T x)
{
    return x;
}
// CPU implementation of im2col: unfolds convolution input patches into the
// columns of a matrix so convolution can be computed as a GEMM.
// NOTE(review): only batch index 0 is read (input(0, c, ...)), so this
// appears to assume batch size 1 — confirm with callers.
struct cpu_im2col
{
    op::im2col op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    static std::string name() { return "cpu::im2col"; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        return op.normalize_compute_shape(inputs);
    }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        auto input_shape   = args[0].get_shape();
        auto weights_shape = args[1].get_shape();
        visit_all(result, args[0])([&](auto col, auto input) {
            const std::size_t& height   = input_shape.lens()[2];
            const std::size_t& width    = input_shape.lens()[3];
            const std::size_t& channels = weights_shape.lens()[1];
            const std::size_t& kernel_h = weights_shape.lens()[2];
            const std::size_t& kernel_w = weights_shape.lens()[3];
            const std::size_t& pad_h    = op.padding[0];
            const std::size_t& pad_w    = op.padding[1];
            const std::size_t& stride_h = op.stride[0];
            const std::size_t& stride_w = op.stride[1];

            // Half-kernel offsets used to center the window on each input pixel.
            long kdiv2_h = long(kernel_h) / 2;
            long kdiv2_w = long(kernel_w) / 2;
            // calculate output sizes
            const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
            const std::size_t col_width  = (width - kernel_w + 2 * pad_w) / stride_w + 1;
            // account for padding for the starting position of the input pixels
            long iinput = kdiv2_h - long(pad_h);
            // loop over output pixels (ioutput, joutput)
            for(std::size_t ioutput = 0; ioutput < col_height; ioutput++, iinput += stride_h)
            {
                long jinput = kdiv2_w - long(pad_w);
                for(std::size_t joutput = 0; joutput < col_width; joutput++, jinput += stride_w)
                {
                    // compute linear index for output
                    std::size_t ldx = ioutput * col_width + joutput;
                    std::size_t p   = 0;
                    // Gather the (channels x kernel_h x kernel_w) patch; pixels
                    // that fall outside the input (in the padded border) are 0.
                    dfor(channels,
                         kernel_h,
                         kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) {
                        auto idx = iinput + long(koffset) - kdiv2_h;
                        auto jdx = jinput + long(loffset) - kdiv2_w;
                        col(ldx, p) =
                            ((idx >= 0) and (idx < height) and (jdx >= 0) and (jdx < width))
                                ? input(0, c, idx, jdx)
                                : 0;
                        p++;
                    });
                }
            }
        });
        return result;
    }
};
MIGRAPHX_REGISTER_OP(cpu_im2col)
// Generic wrapper that runs any wrapped operation on the CPU target by
// delegating compute_shape/compute to it.
struct cpu_op
{
    operation op = op::identity{};
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    std::string name() const { return "cpu::op"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, const std::vector<argument>& args) const
    {
        return op.compute(output_shape, args);
    }
    // Serialize as the wrapped op's name plus its own serialized value.
    value to_value() const
    {
        value v;
        v["name"]     = op.name();
        v["operator"] = op.to_value();
        return v;
    }
    // Reconstruct the wrapped op from the serialized form produced above.
    void from_value(const value& v)
    {
        op = make_op(v.at("name").to<std::string>(), v.at("operator"));
    }
    friend std::ostream& operator<<(std::ostream& os, const cpu_op& x)
    {
        os << "cpu::" << x.op;
        return os;
    }
};
MIGRAPHX_REGISTER_OP(cpu_op)
// CPU pad: fills the output with the pad value, then copies the input into
// the interior region shifted by the leading pad amounts.
struct cpu_pad
{
    op::pad op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    std::string name() const { return "cpu::pad"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        assert(output_shape.standard());
        argument result{output_shape};
        // Fill everything with the pad value, clamped to the output type's range.
        result.visit([&](auto output) {
            using type = typename decltype(output)::value_type;
            std::fill(output.begin(), output.end(), pad_clamp<type>(op.value));
        });
        // Copy each input element to its position offset by the leading pads.
        visit_all(result, args[0])([&](auto output, auto input) {
            shape_for_each(input.get_shape(), [&](const auto& idx) {
                std::vector<std::size_t> new_idx(idx.size());
                std::transform(
                    idx.begin(), idx.end(), op.pads.begin(), new_idx.begin(), [](auto i, auto j) {
                        return i + j;
                    });
                output(new_idx.begin(), new_idx.end()) = input(idx.begin(), idx.end());
            });
        });
        return result;
    }
};
MIGRAPHX_REGISTER_OP(cpu_pad)
// Extract the last valid output of a variable-sequence-length RNN.
// args[0] is the full output sequence; args[1] holds per-batch sequence lengths.
struct cpu_rnn_var_sl_last_output
{
    op::rnn_var_sl_last_output op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    std::string name() const { return "cpu::rnn_var_sl_last_output"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
        return op.compute_shape(std::move(inputs));
    }
    argument compute(const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        // Shape used to decode flat output indices: the input's lens with the
        // sequence (first) dimension collapsed to 1.
        auto out_comp_lens = args[0].get_shape().lens();
        out_comp_lens[0]   = 1;
        shape out_comp_s{output_shape.type(), out_comp_lens};
        visit_all(result, args[0])([&](auto output, auto input) {
            args[1].visit([&](auto seq_lens) {
                par_for(output_shape.elements(), [&](auto i) {
                    auto idx = out_comp_s.multi(i);
                    // NOTE(review): idx[2] is treated as the batch index and
                    // idx[1] as the direction index — confirm the layout.
                    auto b = idx[2];
                    // For the reverse direction (or the second direction of a
                    // bidirectional run) the last output is at time step 0;
                    // otherwise it is at seq_lens[b] - 1.
                    if(op.direction == op::rnn_direction::reverse or idx[1] == 1)
                    {
                        idx[0] = 0;
                    }
                    else
                    {
                        idx[0] = seq_lens[b] - 1;
                    }
                    output[i] = input(idx.begin(), idx.end());
                });
            });
        });
        return result;
    }
};
MIGRAPHX_REGISTER_OP(cpu_rnn_var_sl_last_output)
// Rewrites a module's instructions into their CPU/oneDNN implementations.
// apply_map maps reference op names to replacement callbacks; init() populates
// it and apply() walks the module applying fusions and replacements.
struct cpu_apply
{
    module* modl;
    std::unordered_map<std::string, std::function<instruction_ref(instruction_ref)>> apply_map{};
    instruction_ref last{};

    // Map op_name -> replacement with cpu_name, forwarding the op's value.
    // When allocate is true, an output allocation is appended to the inputs.
    void extend_op(const std::string& op_name, const std::string& cpu_name, bool allocate = true)
    {
        apply_map.emplace(op_name, [=](instruction_ref ins) {
            auto&& op = ins->get_operator();
            if(allocate)
                return replace(ins, make_op(cpu_name, op.to_value()));
            return modl->replace_instruction(ins, make_op(cpu_name, op.to_value()), ins->inputs());
        });
    }

    // Map each (op_name, algo) pair to the dnnl op dnnl_name with its "algo"
    // field set, keeping the original op's remaining attributes.
    void extend_dnnl_algos(const std::string& dnnl_name,
                           const std::vector<std::pair<std::string, std::string>>& algos)
    {
        for(auto&& pp : algos)
        {
            std::string op_name = pp.first;
            std::string algo    = pp.second;
            apply_map.emplace(op_name, [=](instruction_ref ins) {
                auto v = ins->get_operator().to_value();
                // Ops whose value is not an object cannot carry an algo field.
                if(not v.is_object())
                    return ins;
                v["algo"] = algo;
                auto op   = make_op(dnnl_name, v);
                return replace(ins, op);
            });
        }
    }

    // Build a match finder that replaces the matched instruction with op,
    // using the named captured instructions (bind_inputs) plus an allocation.
    template <class M>
    auto fuse_match(M matcher, const operation& op, const std::vector<std::string>& bind_inputs)
    {
        return match::make_match_finder(matcher, [=](auto&, const auto& r) {
            auto ins = r.result;
            std::vector<instruction_ref> inputs;
            std::transform(bind_inputs.begin(),
                           bind_inputs.end(),
                           std::back_inserter(inputs),
                           [&](const auto& s) { return r.instructions[s]; });
            inputs.push_back(this->insert_allocation(ins, ins->get_shape()));
            modl->replace_instruction(ins, op, inputs);
        });
    }

    // Populate apply_map with the dnnl and cpu replacements.
    void init()
    {
        extend_dnnl_algos("dnnl::binary",
                          {
                              {"add", "binary_add"},
                              {"div", "binary_div"},
                              {"max", "binary_max"},
                              {"min", "binary_min"},
                              {"mul", "binary_mul"},
                          });
        extend_dnnl_algos("dnnl::eltwise",
                          {
                              {"abs", "eltwise_abs"},
                              {"elu", "eltwise_elu"},
                              {"exp", "eltwise_exp"},
                              {"log", "eltwise_log"},
                              {"relu", "eltwise_relu"},
                              {"sqrt", "eltwise_sqrt"},
                              {"tanh", "eltwise_tanh"},
                          });
        extend_dnnl_algos("dnnl::reduction",
                          {
                              {"reduce_max", "reduction_max"},
                              {"reduce_mean", "reduction_mean"},
                              {"reduce_min", "reduction_min"},
                              {"reduce_sum", "reduction_sum"},
                          });
        extend_op("concat", "dnnl::concat");
        extend_op("contiguous", "dnnl::reorder");
        extend_op("convolution", "dnnl::convolution");
#ifndef MIGRAPHX_ENABLE_ZENDNN
        extend_op("convolution_backwards", "dnnl::convolution_backwards");
        extend_op("dot", "dnnl::dot");
#endif
        extend_op("erf", "cpu::erf");
        extend_op("gather", "cpu::gather");
        extend_op("logsoftmax", "dnnl::logsoftmax");
        extend_op("lrn", "dnnl::lrn");
        extend_op("softmax", "dnnl::softmax");
        // These replacements reuse the instruction's existing inputs (no
        // output allocation is appended).
        extend_op("im2col", "cpu::im2col", false);
        extend_op("leaky_relu", "cpu::leaky_relu", false);
        extend_op("pad", "cpu::pad", false);
        extend_op("rnn_var_sl_last_output", "cpu::rnn_var_sl_last_output", false);
    }

    // Main driver: fusions first, then per-instruction replacements.
    void apply()
    {
        init();
        // Apply fusion matchers first
        match::find_matches(*modl,
                            fuse_match(match::gelu_erf(),
                                       make_op("dnnl::eltwise", {{"algo", "eltwise_gelu_erf"}}),
                                       {"x"}),
                            fuse_match(match::gelu_tanh(),
                                       make_op("dnnl::eltwise", {{"algo", "eltwise_gelu_tanh"}}),
                                       {"x"}),
                            fuse_match(match::layernorm(), make_op("dnnl::layernorm"), {"x"}));
        // Apply these operators first so the inputs can be const folded
        for(auto it : iterator_for(*modl))
        {
            // skip lowering if input has fp8 as one of the inputs since oneDNN doesn't have fp8
            // supported yet.
            if(std::any_of(it->inputs().begin(), it->inputs().end(), [](const auto& i) {
                   return contains(fp8_types{}.get(), i->get_shape().type());
               }))
                continue;
            if(it->name() == "pow")
            {
                apply_pow(it);
            }
        }
        for(auto it : iterator_for(*modl))
        {
            // skip lowering if input has fp8 as one of the inputs since oneDNN doesn't have fp8
            // supported yet.
            if(std::any_of(it->inputs().begin(), it->inputs().end(), [](const auto& i) {
                   return contains(fp8_types{}.get(), i->get_shape().type());
               }))
                continue;
            if(it->name() == "pooling")
            {
                apply_pooling(it);
            }
            else if(it->name() == "reshape")
            {
                apply_reshape(it);
            }
            else if(apply_map.count(it->name()) > 0)
            {
                apply_map.at(it->name())(it);
            }
        }
    }

    // Lower pow with a constant exponent to dnnl eltwise_pow; leave other
    // pow instructions untouched.
    instruction_ref apply_pow(instruction_ref ins) const
    {
        auto beta = read_scalar<float>(ins->inputs()[1]);
        if(beta.empty())
            return ins;
        return replace(ins,
                       make_op("dnnl::eltwise",
                               {{"algo", "eltwise_pow"}, {"alpha", 1.0}, {"beta", beta.front()}}),
                       {ins->inputs().front()});
    }

    // TODO: update lowering to run the reference
    // code when OneDNN can't execute pooling for a CPU
    // OneDNN has a limitation on padding size for pooling. see
    // https://oneapi-src.github.io/oneDNN/dev_guide_convolution.html#doxid-dev-guide-convolution
    // padding = {2}; stride = {1}; lengths = {3} succeeds in oneDNN but
    // padding = {2}; stride = {1}; lengths = {2} fails.
    // Also, the referenced documentation contains a max. dimension size of 14 for the kernel
    // ("weights tensor") that MIGraphX doesn't enforce.
    instruction_ref apply_pooling(instruction_ref ins) const
    {
        auto&& op = ins->get_operator();
        auto v    = op.to_value();
        // dnnl pooling is only used for float, non-ceil, non-lpnorm pooling.
        if(has_op("dnnl::pooling") and ins->get_shape().type() == shape::type_t::float_type and
           not v["ceil_mode"].to<bool>() and
           v["mode"].to<op::pooling_mode>() != op::pooling_mode::lpnorm)
            return replace(ins, make_op("dnnl::pooling", op.to_value()));
        return ins;
    }

    /*
    Lowers reshape copy operator to reshape lazy by inserting contiguous operators around it.
    Contiguous ops will later by removed by eliminate_contiguous pass.
    */
    instruction_ref apply_reshape(instruction_ref ins) const
    {
        std::vector<instruction_ref> before_contiguous_args = ins->inputs();
        auto before_alloc =
            insert_allocation(ins, before_contiguous_args.front()->get_shape().as_standard());
        before_contiguous_args.push_back(before_alloc);
        auto before_contig =
            modl->insert_instruction(ins, make_op("dnnl::reorder"), {before_contiguous_args});
        auto new_lazy_reshape = modl->insert_instruction(
            ins,
            make_op("reshape_lazy", {{"dims", {ins->get_operator().to_value().at("dims")}}}),
            before_contig);
        std::vector<instruction_ref> after_contiguous_args = {new_lazy_reshape};
        auto after_alloc = insert_allocation(new_lazy_reshape, new_lazy_reshape->get_shape());
        after_contiguous_args.push_back(after_alloc);
        return modl->replace_instruction(ins, make_op("dnnl::reorder"), after_contiguous_args);
    }

    // Evaluate an instruction to a single scalar of type T; returns an empty
    // vector when the instruction is not a constant scalar. Looks through
    // contiguous instructions.
    template <class T>
    static std::vector<T> read_scalar(instruction_ref ins)
    {
        if(ins->name() == "contiguous")
            return read_scalar<T>(ins->inputs().front());
        if(ins->get_shape().elements() != 1 and not ins->get_shape().scalar())
            return {};
        auto r = ins->eval();
        if(r.empty())
            return {};
        return {r.at<T>()};
    }

    // Replace ins with op, reusing its inputs plus a new output allocation.
    instruction_ref replace(instruction_ref ins, const operation& op) const
    {
        return replace(ins, op, ins->inputs());
    }

    // Replace ins with op over the given inputs plus a new output allocation.
    instruction_ref
    replace(instruction_ref ins, const operation& op, std::vector<instruction_ref> inputs) const
    {
        inputs.push_back(insert_allocation(ins, ins->get_shape()));
        return modl->replace_instruction(ins, op, inputs);
    }

    // Insert an allocate instruction for shape s before ins.
    instruction_ref insert_allocation(instruction_ref ins, const shape& s) const
    {
        return modl->insert_instruction(ins, make_op("allocate", {{"shape", to_value(s)}}));
    }
};
// Entry point of the lowering pass: delegate all rewriting to cpu_apply.
void lowering::apply(module& m) const
{
    cpu_apply rewriter{&m};
    rewriter.apply();
}
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,48 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/lrn.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Local response normalization backed by dnnl::lrn_forward.
struct dnnl_lrn : dnnl_extend_op<dnnl_lrn, dnnl::lrn_forward, op::lrn>
{
    // Build the dnnl LRN descriptor from the op attributes: inference-only,
    // across-channel normalization with the op's size/alpha/beta/bias.
    dnnl::lrn_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        const auto& src = m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0));
        return {dnnl::prop_kind::forward_inference,
                dnnl::algorithm::lrn_across_channels,
                src,
                this->op.size,
                this->op.alpha,
                this->op.beta,
                this->op.bias};
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,36 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/mod.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template struct cpu_binary<op::mod>;
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,83 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/pooling.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Pooling (max/average) backed by dnnl::pooling_v2_forward.
struct dnnl_pooling : dnnl_extend_op<dnnl_pooling, dnnl::pooling_v2_forward, op::pooling>
{
    // Only the source tensor maps to a dnnl argument; the destination is
    // handled by the base class.
    std::vector<int> arg_map(int) const { return {MIGRAPHX_DNNL_PREFIX(ARG_SRC)}; }
    // Translate the MIGraphX pooling mode to the matching dnnl algorithm.
    // Throws for lpnorm, which dnnl does not support.
    dnnl::algorithm get_algo() const
    {
        switch(op.mode)
        {
        case op::pooling_mode::max: return dnnl::algorithm::pooling_max;
        case op::pooling_mode::average:
            return op.count_include_pad ? dnnl::algorithm::pooling_avg_include_padding
                                        : dnnl::algorithm::pooling_avg_exclude_padding;
        // Fixed typo in the error message: "Lpnorn" -> "Lpnorm".
        case op::pooling_mode::lpnorm: MIGRAPHX_THROW("Lpnorm pooling mode not supported");
        }
        MIGRAPHX_THROW("Unknown pooling mode");
    }
    // Build the dnnl pooling descriptor from the op attributes.
    dnnl::pooling_v2_forward::desc
    get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        auto algo  = get_algo();
        auto kdims = op.kdims();
        // op.padding stores begin-padding followed by end-padding; split it
        // into dnnl's left/right padding vectors.
        std::vector<size_t> padding_l(op.padding.begin(), op.padding.begin() + kdims);
        std::vector<size_t> padding_r(op.padding.begin() + kdims, op.padding.end());
        // Note: It is not documented, but the default dilation seems to be 0 instead of 1.
        // We need to offset dilations with -1.
        std::vector<size_t> dilations;
        std::transform(op.dilations.cbegin(),
                       op.dilations.cend(),
                       std::back_inserter(dilations),
                       [](size_t d) { return d - 1; });
        return {dnnl::prop_kind::forward_inference,
                algo,
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)),
                to_dnnl_dims(op.stride),
                to_dnnl_dims(op.lengths),
                to_dnnl_dims(dilations),
                to_dnnl_dims(padding_l),
                to_dnnl_dims(padding_r)};
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,60 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Operator that owns a buffer with global lifetime. The buffer is created
// once in finalize() and handed out unchanged from every compute() call
// (e.g. the "scratch" parameter produced by the preallocate_param pass).
struct cpu_preallocate : auto_register_op<cpu_preallocate>
{
    // Shape of the buffer to allocate.
    shape s;
    // Optional identifier for this allocation.
    std::string id = "";
    // The buffer itself; populated by finalize(), not serialized.
    argument data;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        // data is deliberately not reflected: it is runtime state.
        return pack(f(self.s, "shape"), f(self.id, "id"))
    }
    std::string name() const { return "cpu::preallocate"; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        // Takes no inputs; the output shape is the stored shape.
        check_shapes{inputs, *this}.has(0);
        return s;
    }
    // No work at evaluation time: just return the preallocated buffer.
    argument compute(context&, const shape&, const std::vector<argument>&) const { return data; }
    // Allocate the buffer once, before the program runs.
    void finalize(context&, const shape&, const std::vector<shape>&) { data = argument(s); }
    // Global lifetime: the buffer outlives a single evaluation.
    lifetime get_lifetime() const { return lifetime::global; }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,73 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Reduction (sum/max/min/...) backed by dnnl::reduction; the algorithm name
// is stored as a string and resolved via to_dnnl_algo.
struct dnnl_reduction : dnnl_op<dnnl_reduction, dnnl::reduction>
{
    std::string algo;
    std::vector<std::int64_t> axes{};
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack_join(self.reflect_base(self, f),
                         pack(f(self.algo, "algo"), f(self.axes, "axes")));
    }
    std::string name() const { return "dnnl::reduction"; }
    // Output keeps the input rank with every reduced axis collapsed to 1.
    shape compute_shape(std::vector<shape> inputs) const
    {
        // The trailing input is the output allocation added by lowering.
        inputs.pop_back();
        check_shapes{this->trim_post_op_inputs(inputs), *this}.has(1).standard();
        const auto& in = inputs.at(0);
        auto out_lens  = in.lens();
        for(auto axis : axes)
            out_lens[axis] = 1;
        shape result{in.type(), out_lens};
        // Building the primitive up front verifies dnnl supports this algo.
        this->get_primitive(this->to_memory_desc(result, inputs));
        return result;
    }
    dnnl::reduction::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        // The final two arguments (p, eps) are unused here and passed as 0.
        return {to_dnnl_algo(algo),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
                m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)),
                0,
                0};
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,65 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Layout/format conversion backed by dnnl::reorder. Takes two inputs: the
// source and the output allocation (the output shape is inputs.back()).
struct dnnl_reorder : dnnl_op<dnnl_reorder, dnnl::reorder>
{
    std::string name() const { return "dnnl::reorder"; }
    // Pass shapes through unchanged; no layout adjustment is applied here.
    shape adjust_shape(const shape& x, int, const shape&) const { return x; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        check_shapes{inputs, *this}.has(2);
        auto r = inputs.back();
        // Call to get_primitive to make sure an algo is available
        this->get_primitive(this->to_memory_desc(r, inputs));
        return r;
    }
    // Custom desc class since its missing in dnnl
    struct desc
    {
        dnnl::memory::desc src;
        dnnl::memory::desc dst;
    };
    desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        return {m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)), m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST))};
    }
    // reorder has no op-level desc in dnnl, so build the primitive_desc
    // directly from the source/destination memory descriptors.
    auto get_primitive_desc(const desc& d, const dnnl::primitive_attr& attr) const
    {
        auto& engine = get_dnnl_context().engine;
        return dnnl::reorder::primitive_desc(engine, d.src, engine, d.dst, attr);
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,43 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/softmax.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Softmax backed by dnnl::softmax_forward.
struct dnnl_softmax : dnnl_extend_op<dnnl_softmax, dnnl::softmax_forward, op::softmax>
{
    // Inference-only descriptor; the softmax is applied along the op's axis.
    dnnl::softmax_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        const auto& src = m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0));
        return {dnnl::prop_kind::forward_inference, src, static_cast<int>(this->op.axis)};
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,36 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/sub.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template struct cpu_binary<op::sub>;
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,122 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/adjust_allocation.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_allocation.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/eliminate_concat.hpp>
#include <migraphx/eliminate_contiguous.hpp>
#include <migraphx/eliminate_data_type.hpp>
#include <migraphx/eliminate_identity.hpp>
#include <migraphx/eliminate_pad.hpp>
#include <migraphx/eliminate_convert.hpp>
#include <migraphx/memory_coloring.hpp>
#include <migraphx/propagate_constant.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/replace_allocate.hpp>
#include <migraphx/rewrite_pooling.hpp>
#include <migraphx/rewrite_quantization.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/schedule.hpp>
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/preallocate_param.hpp>
#include <migraphx/cpu/fuse_ops.hpp>
#include <migraphx/cpu/write_literals.hpp>
#include <migraphx/cpu/allocation_model.hpp>
#include <migraphx/cpu/target.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/cpu/lowering.hpp>
#include <migraphx/pass.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/normalize_ops.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
std::string target::name() const { return "cpu"; }
// Compilation pipeline for the cpu target: simplify the graph, lower to
// cpu/dnnl ops, then set up and optimize buffer allocation.
// cppcheck-suppress constParameterReference
std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options&) const
{
    auto& ctx = any_cast<context>(gctx);
    // Start from all shape types and keep only float as supported; the
    // rest are converted to float by eliminate_data_type (except for the
    // listed ops, which are left untouched).
    std::set<shape::type_t> unsupported_types(shape::types().begin(), shape::types().end());
    std::set<std::string> unsupported_ops{
        "all", "scatternd_add", "scatternd_mul", "scatternd_none"};
    unsupported_types.erase(shape::type_t::float_type);
    return {normalize_ops{},
            rewrite_quantization{},
            dead_code_elimination{},
            eliminate_data_type{unsupported_types, shape::type_t::float_type, unsupported_ops},
            dead_code_elimination{},
            // Generic graph simplification passes.
            simplify_reshapes{},
            eliminate_convert{},
            eliminate_identity{},
            eliminate_pad{},
            dead_code_elimination{},
            rewrite_rnn{},
            dead_code_elimination{},
            eliminate_common_subexpression{},
            dead_code_elimination{},
            simplify_algebra{},
            simplify_reshapes{},
            eliminate_convert{},
            dead_code_elimination{},
            simplify_reshapes{},
            eliminate_convert{},
            dead_code_elimination{},
            simplify_algebra{},
            simplify_reshapes{},
            eliminate_convert{},
            dead_code_elimination{},
            propagate_constant{},
            dead_code_elimination{},
            auto_contiguous{},
            // Lower to cpu/dnnl ops; contiguous copies become dnnl::reorder.
            lowering{},
            eliminate_contiguous{"dnnl::reorder"},
            dead_code_elimination{},
            // Materialize and adjust output-buffer allocations, then fuse.
            replace_allocate{cpu_allocation_model{}},
            dead_code_elimination{},
            adjust_allocation{cpu_allocation_model{}},
            dead_code_elimination{},
            fuse_ops{&ctx},
            dead_code_elimination{},
            write_literals{},
            dead_code_elimination{},
            // Reuse buffers and fold them into one preallocated scratch.
            memory_coloring{"cpu::allocate"},
            dead_code_elimination{},
            preallocate_param{"scratch", cpu_allocation_model{}},
            dead_code_elimination{}};
}
argument target::allocate(const shape& s) const { return fill_argument(s, 0); }
MIGRAPHX_REGISTER_TARGET(target);
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,70 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/cpu/write_literals.hpp>
#include <migraphx/module.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Operator wrapping a literal's data so the cpu target can evaluate it like
// any other op: compute() simply returns the stored argument.
struct cpu_literal
{
    // The literal's data, captured from the original @literal instruction.
    argument data;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.data, "data"));
    }
    std::string name() const { return "cpu::literal"; }
    shape compute_shape(const std::vector<shape>&) const { return data.get_shape(); }
    argument compute(const shape&, const std::vector<argument>&) const { return data; }
    friend std::ostream& operator<<(std::ostream& os, const cpu_literal& x)
    {
        // Print only the op name, not the data.
        os << x.name();
        return os;
    }
};
MIGRAPHX_REGISTER_OP(cpu_literal);
// Replace every @literal instruction in the module with a cpu::literal op
// holding the same data.
void write_literals::apply(module& m) const
{
    for(auto ins : iterator_for(m))
    {
        if(ins->name() == "@literal")
            m.replace_instruction(ins, cpu_literal{ins->get_literal().get_argument()});
    }
}
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,43 @@
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
add_library(migraphx_fpga
    target.cpp
    lowering.cpp
    subgraph.cpp
    vitis_ai_adapter.cpp
)
set_target_properties(migraphx_fpga PROPERTIES EXPORT_NAME fpga)
rocm_set_soversion(migraphx_fpga ${MIGRAPHX_SO_VERSION})
rocm_clang_tidy_check(migraphx_fpga)
# PUBLIC: the fpga public headers (include/migraphx/fpga/*.hpp) include core
# migraphx headers, so consumers of migraphx_fpga also need migraphx.
# (An explicit visibility keyword avoids the legacy keyword-less signature.)
target_link_libraries(migraphx_fpga PUBLIC migraphx)
rocm_install_targets(
    PRIVATE
    TARGETS migraphx_fpga
    INCLUDE
    ${CMAKE_CURRENT_SOURCE_DIR}/include
)

View File

@ -0,0 +1,45 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_FPGA_CONTEXT_HPP
#define MIGRAPHX_GUARD_FPGA_CONTEXT_HPP
#include <migraphx/config.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace fpga {
// Minimal execution context for the fpga target; there is no device queue
// here, so finish() is a no-op.
struct context
{
    int id = 0;
    void finish() const {}
};
} // namespace fpga
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_FPGA_CONTEXT_HPP

View File

@ -0,0 +1,47 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_FPGA_LOWERING_HPP
#define MIGRAPHX_GUARD_FPGA_LOWERING_HPP
#include <migraphx/program.hpp>
#include <migraphx/config.hpp>
#include <migraphx/fpga/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace fpga {
// Compiler pass that lowers module instructions into fpga-specific ops
// (see lowering.cpp for the implementation).
struct lowering
{
    // Target context; not owned. May be null until the pass is configured.
    context* ctx = nullptr;
    std::string name() const { return "fpga::lowering"; }
    void apply(module& m) const;
};
} // namespace fpga
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_FPGA_LOWERING_HPP

View File

@ -0,0 +1,45 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_FPGA_SUBGRAPH_HPP
#define MIGRAPHX_GUARD_FPGA_SUBGRAPH_HPP
#include <migraphx/program.hpp>
#include <migraphx/config.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace fpga {
// Module pass that forms fpga subgraphs (implemented in subgraph.cpp).
struct subgraph
{
    std::string name() const { return "fpga::subgraph"; }
    void apply(module_pass_manager& mpm) const;
};
} // namespace fpga
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_FPGA_SUBGRAPH_HPP

View File

@ -0,0 +1,55 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_FPGA_TARGET_HPP
#define MIGRAPHX_GUARD_FPGA_TARGET_HPP
#include <migraphx/program.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/compile_options.hpp>
#include <migraphx/fpga/context.hpp>
#include <migraphx/config.hpp>
#include <migraphx/supported_segments.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct pass;
namespace fpga {
// The fpga compilation/execution target.
struct target
{
    std::string name() const;
    std::vector<pass> get_passes(migraphx::context& ctx, const compile_options&) const;
    migraphx::context get_context() const { return context{}; }
    // Report which segments of the module this target can run.
    supported_segments find_supported(const_module_ref mod, support_metric m) const;
    // Host memory is used directly, so copies to/from the target are no-ops.
    argument copy_to(const argument& arg) const { return arg; }
    argument copy_from(const argument& arg) const { return arg; }
    argument allocate(const shape& s) const;
};
} // namespace fpga
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_FPGA_TARGET_HPP

View File

@ -0,0 +1,52 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_FPGA_VITIS_AI_ADAPTER_HPP
#define MIGRAPHX_GUARD_FPGA_VITIS_AI_ADAPTER_HPP
#include <string>
#include <migraphx/instruction.hpp>
#include <migraphx/pass_manager.hpp>
namespace vitis_ai {
// Stand-in for a compiled Vitis-AI model. It currently carries only a
// shape (presumably the compiled module's output shape — set by
// create_xmodel; TODO confirm against the adapter implementation).
class x_model
{
    migraphx::shape shape;
    public:
    migraphx::shape get_shape() const;
    void set_shape(migraphx::shape);
};
// Build an x_model from a migraphx module (Vitis-AI compilation entry point).
x_model create_xmodel(migraphx::const_module_ref mod);
// Execute xmodel on the given arguments, producing a result of output_shape.
migraphx::argument execute(const x_model& xmodel,
                           const migraphx::shape& output_shape,
                           std::vector<migraphx::argument>& args);
} // namespace vitis_ai
#endif // MIGRAPHX_GUARD_FPGA_VITIS_AI_ADAPTER_HPP

View File

@ -0,0 +1,91 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/fpga/lowering.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/stringutils.hpp>
#include <iostream>
#include "migraphx/fpga/vitis_ai_adapter.hpp"
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace fpga {
// Operation wrapping a compiled Vitis AI xmodel; inserted by lowering in
// place of the fpga::vitis_placeholder instruction.
struct fpga_vitis_op
{
fpga_vitis_op() = default;
explicit fpga_vitis_op(vitis_ai::x_model model) : xmodel(std::move(model)){};
vitis_ai::x_model xmodel;
// Reflected stand-in member while x_model itself is not reflectable.
int dummy = 0;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
// return pack(f(self.xmodel, "xmodel"));
return pack(f(self.dummy, "dummy"));
}
std::string name() const { return "fpga::vitis_ai"; }
// Output shape comes from the xmodel; the input shapes are ignored.
shape compute_shape(const std::vector<shape>& inputs) const
{
(void)inputs;
return xmodel.get_shape();
}
// Delegates execution to the external Vitis AI adapter (stubbed).
argument
compute(const context& ctx, const shape& output_shape, std::vector<argument> args) const
{
std::cout << "The context is " << ctx.id << std::endl;
return ::vitis_ai::execute(xmodel, output_shape, args);
}
};
MIGRAPHX_REGISTER_OP(fpga_vitis_op)
// Replace every fpga::vitis_placeholder instruction in m with an
// fpga::vitis_ai op built from the placeholder's single submodule.
void lowering::apply(module& m) const
{
    // test modifying the context from a pass
    ctx->id = 2;
    for(auto ins : iterator_for(m))
    {
        if(ins->name() != "fpga::vitis_placeholder")
            continue;
        assert(ins->module_inputs().size() == 1);
        auto xmodel = ::vitis_ai::create_xmodel(ins->module_inputs().front());
        m.replace_instruction(ins, fpga_vitis_op{std::move(xmodel)}, ins->inputs());
    }
}
} // namespace fpga
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,133 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/fpga/subgraph.hpp>
#include <migraphx/instruction.hpp>
#include "migraphx/iterator.hpp"
#include <migraphx/iterator_for.hpp>
#include "migraphx/make_op.hpp"
#include "migraphx/module.hpp"
#include "migraphx/ranges.hpp"
#include <migraphx/register_op.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/pass_manager.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace fpga {
// Placeholder operation marking the region of the graph destined for the
// FPGA; it carries the extracted submodule and is later replaced by the
// fpga::vitis_ai op during lowering.
struct fpga_placeholder_op
{
fpga_placeholder_op() = default;
// Reflected stand-in member; the op has no tunable state of its own.
int dummy = 0;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.dummy, "dummy"));
}
std::string name() const { return "fpga::vitis_placeholder"; }
// Output shape is the single output shape of the attached submodule;
// the instruction's own input shapes are ignored.
shape compute_shape(const std::vector<shape>& inputs, std::vector<module_ref> mods) const
{
(void)inputs;
if(mods.size() != 1)
{
MIGRAPHX_THROW("should have one submodule.");
}
module_ref sm = mods.front();
if(sm->get_output_shapes().size() != 1)
MIGRAPHX_THROW("Only one return");
return sm->get_output_shapes().front();
}
};
MIGRAPHX_REGISTER_OP(fpga_placeholder_op)
// An instruction is offloaded to the FPGA unless it is a builtin
// (@param, @literal, ...) or has no inputs (i.e. it is raw input data).
bool is_fpga_instr(migraphx::instruction_ref ins)
{
    const bool is_builtin    = migraphx::starts_with(ins->name(), "@");
    const bool is_input_data = ins->inputs().empty();
    return not(is_builtin or is_input_data);
}
// Extract the FPGA-eligible portion of the module into a ":fpga" submodule and
// insert a single fpga::vitis_placeholder instruction that takes all params
// and literals as inputs and becomes the module's return value.
void subgraph::apply(module_pass_manager& mpm) const
{
    auto& mod = mpm.get_module();
    auto* pm  = mpm.create_module(mod.name() + ":fpga");
    pm->set_bypass();
    migraphx::instruction_ref first = mod.end();
    migraphx::instruction_ref last  = mod.end();
    std::vector<migraphx::instruction_ref> literal_inputs;
    for(auto it : iterator_for(mod))
    {
        // assuming we want all the params/literals as inputs to the FPGA submodule
        if(migraphx::starts_with(it->name(), "@param") or
           migraphx::starts_with(it->name(), "@literal"))
        {
            literal_inputs.push_back(it);
        }
        if(is_fpga_instr(it))
        {
            if(first == mod.end())
            {
                first = it;
            }
            last = it;
        }
    }
    // Fix: bail out when no FPGA instruction was found; previously `last` was
    // left default-constructed and std::next(last) was undefined behavior.
    if(first == mod.end())
        return;
    // TODO(varunsh): this code may be replaceable by code in the fuse_pointwise pass
    // assuming all FPGA instructions are in one contiguous range
    pm->insert_instructions(pm->end(), first, std::next(last), {});
    migraphx::instruction_ref placeholder_ins = mod.end();
    for(auto it : iterator_for(mod))
    {
        if(migraphx::starts_with(it->name(), "@return"))
        {
            placeholder_ins = mod.insert_instruction(
                it, migraphx::make_op("fpga::vitis_placeholder"), literal_inputs, {pm});
            break;
        }
    }
    // Fix: only rewrite the return when the placeholder was actually inserted;
    // previously a module without @return used an uninitialized instruction_ref.
    if(placeholder_ins == mod.end())
        return;
    mod.replace_return({placeholder_ins});
}
} // namespace fpga
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,83 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/fpga/target.hpp>
#include <migraphx/fpga/lowering.hpp>
#include <migraphx/fpga/subgraph.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/pass.hpp>
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/eliminate_pad.hpp>
#include <migraphx/insert_pad.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/normalize_ops.hpp>
#include <migraphx/iterator_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace fpga {
// Identifier under which this target is registered and looked up.
std::string target::name() const { return "fpga"; }
// Compile pipeline for the FPGA target: normalization and cleanup passes
// followed by FPGA subgraph extraction and lowering to fpga::vitis_ai ops.
std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options&) const
{
// not sure if all these passes are needed but they were copied from ref/
auto& ctx = any_cast<context>(gctx);
return {normalize_ops{},
eliminate_pad{},
dead_code_elimination{},
insert_pad{},
dead_code_elimination{},
rewrite_rnn{},
dead_code_elimination{},
auto_contiguous{},
dead_code_elimination{},
subgraph{},
dead_code_elimination{},
lowering{&ctx},
dead_code_elimination{}};
}
// Host-side allocation: returns a zero-filled argument of shape s.
argument target::allocate(const shape& s) const { return fill_argument(s, 0); }
// Report the entire module as one supported segment; the support metric is
// currently ignored and the segment weight is a fixed placeholder.
supported_segments target::find_supported(const_module_ref mod, support_metric m) const
{
    (void)m;
    supported_segment segment;
    for(const auto ins : iterator_for(*mod))
    {
        segment.instructions.insert(ins);
    }
    segment.metric = 1; // arbitrary value
    return {segment};
}
MIGRAPHX_REGISTER_TARGET(target);
} // namespace fpga
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,65 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "migraphx/fpga/vitis_ai_adapter.hpp"

#include <iostream>
#include <utility>

#include "migraphx/module.hpp"
#include "migraphx/stringutils.hpp"
namespace vitis_ai {
// Returns the output shape recorded for this model.
// (Removed the stray semicolon that followed the function definition.)
migraphx::shape x_model::get_shape() const { return shape; }
// Records the model's output shape; the by-value parameter is moved into the
// member instead of copied.
void x_model::set_shape(migraphx::shape s) { shape = std::move(s); }
// Stub for compiling a module to an xmodel: logs the call and records only
// the module's output shapes.
x_model create_xmodel(migraphx::const_module_ref mod)
{
    std::cout << "Calling an external function: create_xmodel!\n";
    x_model model;
    model.set_shape(migraphx::shape(mod->get_output_shapes()));
    return model;
}
// Stub FPGA execution: logs the request and every input shape, then returns
// an argument of the requested output shape.
migraphx::argument execute(const x_model& xmodel,
                           const migraphx::shape& output_shape,
                           std::vector<migraphx::argument>& args)
{
    (void)xmodel;
    std::cout << "Calling an external function: execute!\n";
    std::cout << "Output Shape: " << output_shape << std::endl;
    std::cout << "Args: " << args.size() << std::endl;
    for(std::size_t i = 0; i < args.size(); ++i)
    {
        std::cout << " " << args[i].get_shape() << std::endl;
    }
    std::cout << std::endl;
    return migraphx::argument{output_shape};
}
} // namespace vitis_ai

View File

@ -0,0 +1,407 @@
# ####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# ####################################################################################
# --- GPU toolchain and math-library dependencies ------------------------------
find_package(hip REQUIRED)
# GPU_TARGETS is normally provided by the hip package; fail early with a hint
# when it is missing.
if(NOT GPU_TARGETS)
set(fatal_msg "HIP package is broken and has no GPU_TARGETS. Please pass GPU_TARGETS to cmake.")
if(NOT WIN32)
set(fatal_msg "${fatal_msg}\nUse -DGPU_TARGETS=$(/opt/rocm/bin/rocminfo | grep -o -m1 'gfx.*') to build for your GPU.")
endif()
message(FATAL_ERROR ${fatal_msg})
endif()
if(MIGRAPHX_USE_MIOPEN)
find_package(miopen REQUIRED)
message(STATUS "MIGraphX is using MIOpen")
else()
message(STATUS "MIGraphX is not using MIOpen")
endif()
if(MIGRAPHX_USE_ROCBLAS)
# rocblas
find_package(rocblas REQUIRED)
message(STATUS "MIGraphX build with rocBLAS")
else()
message(STATUS "MIGraphX build without rocBLAS")
endif()
if(MIGRAPHX_USE_HIPBLASLT)
# hipblaslt
find_package(hipblaslt REQUIRED)
# Making hipblas required to workaround the broken hipblaslt package.
find_package(hipblas REQUIRED)
message(STATUS "MIGraphx build with hipBLAS and hipBLASLt")
else()
message(STATUS "MIGraphX build without hipBLAS and hipBLASLt")
endif()
if(MIGRAPHX_USE_COMPOSABLEKERNEL)
find_package(composable_kernel 1.0.0 REQUIRED COMPONENTS jit_library)
endif()
# hipRTC defaults: on for regular builds, off for developer builds.
if(BUILD_DEV)
set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "Use hipRTC APIs")
else()
set(MIGRAPHX_USE_HIPRTC ON CACHE BOOL "Use hipRTC APIs")
endif()
# Device kernel headers embedded into the library via add_embed_library below.
file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp)
# CK-based kernels are only shipped when composable_kernel is enabled.
if(NOT MIGRAPHX_USE_COMPOSABLEKERNEL)
list(REMOVE_ITEM KERNEL_FILES
${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck_gemm.hpp
${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp
${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck.hpp)
endif()
include(Embed)
add_embed_library(migraphx_kernels ${KERNEL_FILES} RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/)
# --- migraphx_device: device-side code compiled as HIP -------------------------
configure_file(device/targets.hpp.in include/migraphx/gpu/device/targets.hpp)
file(GLOB DEVICE_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/device/*.cpp)
add_library(migraphx_device ${DEVICE_GPU_SRCS})
# Interface target bundling the flags required to compile and link HIP device code.
add_library(compile_for_gpu INTERFACE)
target_compile_features(compile_for_gpu INTERFACE cxx_std_17)
target_compile_options(compile_for_gpu INTERFACE -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fnative-half-arguments-and-returns)
target_link_options(compile_for_gpu INTERFACE -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-option-ignored)
target_link_libraries(compile_for_gpu INTERFACE hip::device)
check_cxx_compiler_flag("--cuda-host-only -fhip-lambda-host-device -x hip" HAS_HIP_LAMBDA_HOST_DEVICE)
if(HAS_HIP_LAMBDA_HOST_DEVICE)
message(STATUS "Enable -fhip-lambda-host-device")
target_compile_options(compile_for_gpu INTERFACE -fhip-lambda-host-device)
endif()
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_set_soversion(migraphx_device ${MIGRAPHX_SO_VERSION})
rocm_clang_tidy_check(migraphx_device)
target_link_libraries(migraphx_device PUBLIC migraphx)
target_link_libraries(migraphx_device PRIVATE compile_for_gpu)
if(NOT MIGRAPHX_USE_MIOPEN AND NOT MIGRAPHX_USE_ROCBLAS)
target_link_libraries(migraphx_device INTERFACE hip::host)
endif()
target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
# Fixed: was CMAKE_CURRENT_BINAR_DIR (typo), which silently dropped the binary
# include dir that holds the configured device/targets.hpp header.
target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>)
target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
target_compile_options(migraphx_device PRIVATE -Wno-ignored-attributes)
migraphx_generate_export_header(migraphx_device DIRECTORY migraphx/gpu/device)
# --- kernel_file_check: compile-test every embedded kernel header --------------
# Generates a one-line .cpp per kernel header so each header is verified to
# compile standalone; built only on demand (EXCLUDE_FROM_ALL).
add_library(kernel_file_check EXCLUDE_FROM_ALL)
foreach(KERNEL_FILE ${KERNEL_FILES})
get_filename_component(KERNEL_BASE_FILE ${KERNEL_FILE} NAME_WE)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp "#include <migraphx/kernels/${KERNEL_BASE_FILE}.hpp>\n")
target_sources(kernel_file_check PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp)
endforeach()
target_compile_definitions(kernel_file_check PRIVATE -DMIGRAPHX_NLOCAL=256)
target_compile_definitions(kernel_file_check PRIVATE -DMIGRAPHX_WAVEFRONTSIZE=64)
target_include_directories(kernel_file_check PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/>)
# Fixed: use an explicit visibility keyword; the plain target_link_libraries
# signature has legacy semantics and cannot be mixed with keyword calls on the
# same target.
target_link_libraries(kernel_file_check PRIVATE compile_for_gpu)
if(MIGRAPHX_USE_COMPOSABLEKERNEL)
target_link_libraries(kernel_file_check PRIVATE composable_kernel::jit_library)
endif()
rocm_clang_tidy_check(kernel_file_check)
# JIT compiler sources; CK-backed ones are conditional on composable_kernel.
file(GLOB JIT_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)
if(NOT MIGRAPHX_USE_COMPOSABLEKERNEL)
list(REMOVE_ITEM JIT_GPU_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm_softmax_gemm.cpp)
endif()
if(MIGRAPHX_USE_MIOPEN)
set(MIOPEN_SRCS abs.cpp)
endif()
# --- migraphx_gpu: host-side GPU target library --------------------------------
add_library(migraphx_gpu
analyze_streams.cpp
allocation_model.cpp
argmax.cpp
argmin.cpp
code_object_op.cpp
compile_ops.cpp
compile_gen.cpp
compile_hip.cpp
compile_hip_code_object.cpp
compile_hipblaslt.cpp
compile_miopen.cpp
compile_pointwise.cpp
compiler.cpp
device_name.cpp
fuse_ck.cpp
fuse_mlir.cpp
fuse_ops.cpp
gemm_impl.cpp
hip.cpp
hipblaslt.cpp
hip_gemm_impl.cpp
kernel.cpp
lowering.cpp
logsoftmax.cpp
loop.cpp
lrn.cpp
mlir.cpp
multinomial.cpp
no_device.cpp
nonzero.cpp
pack_args.cpp
prefuse_ops.cpp
prepare_reduce.cpp
perfdb.cpp
pooling.cpp
problem_cache.cpp
reverse.cpp
rnn_variable_seq_lens.cpp
rocblas.cpp
schedule_model.cpp
sync_device.cpp
target.cpp
time_op.cpp
topk.cpp
write_literals.cpp
${JIT_GPU_SRCS}
${MIOPEN_SRCS}
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
migraphx_generate_export_header(migraphx_gpu)
# Helper: registers gpu::${PREFIX}${OP} for every op name passed in ARGN.
function(register_migraphx_gpu_ops PREFIX)
foreach(OP ${ARGN})
register_op(migraphx_gpu HEADER migraphx/gpu/${OP}.hpp OPERATORS gpu::${PREFIX}${OP} INCLUDES migraphx/gpu/context.hpp)
endforeach()
endfunction()
# Ops implemented with plain HIP kernels.
register_migraphx_gpu_ops(hip_
argmax
argmin
logsoftmax
loop
multinomial
nonzero
prefix_scan_sum
reverse
topk
)
# MIOpen-backed ops; only contiguous is registered without MIOpen.
if (MIGRAPHX_USE_MIOPEN)
register_migraphx_gpu_ops(miopen_
abs
contiguous
lrn
pooling
)
else()
register_migraphx_gpu_ops(miopen_
contiguous
)
endif()
register_op(migraphx_gpu
HEADER migraphx/gpu/rnn_variable_seq_lens.hpp
OPERATORS gpu::hip_rnn_var_sl_shift_sequence gpu::hip_rnn_var_sl_shift_output gpu::hip_rnn_var_sl_last_output
INCLUDES migraphx/gpu/context.hpp)
# GEMM ops are backend-dependent: rocBLAS and/or hipBLASLt.
if(MIGRAPHX_USE_ROCBLAS)
register_op(migraphx_gpu
HEADER migraphx/gpu/gemm.hpp
OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
INCLUDES migraphx/gpu/context.hpp)
endif()
if(MIGRAPHX_USE_HIPBLASLT)
register_op(migraphx_gpu
HEADER migraphx/gpu/hip_gemm.hpp
OPERATORS gpu::hip_gemm<op::dot> gpu::hip_gemm<op::quant_dot>
INCLUDES migraphx/gpu/context.hpp)
endif()
if (MIGRAPHX_USE_MIOPEN)
register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp
OPERATORS gpu::miopen_convolution<op::convolution> gpu::miopen_convolution<op::convolution_backwards> gpu::miopen_convolution<op::quant_convolution>
INCLUDES migraphx/gpu/context.hpp)
endif()
rocm_set_soversion(migraphx_gpu ${MIGRAPHX_SO_VERSION})
rocm_clang_tidy_check(migraphx_gpu)
# --- rocMLIR and kernel-compilation backend (hipRTC vs. HIP Clang) -------------
set(MIGRAPHX_ENABLE_MLIR ON CACHE BOOL "")
if(MIGRAPHX_ENABLE_MLIR)
# Find package rocMLIR
find_package(rocMLIR 1.0.0 CONFIG REQUIRED)
message(STATUS "Build with rocMLIR::rockCompiler ${rocMLIR_VERSION}")
target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_MLIR")
# Make this private to avoid multiple inclusions of LLVM symbols.
# TODO: Fix rocMLIR's library to hide LLVM internals.
target_link_libraries(migraphx_gpu PRIVATE rocMLIR::rockCompiler)
endif()
if(MIGRAPHX_USE_HIPRTC)
find_package(hiprtc REQUIRED)
message(STATUS "MIGraphX is using hipRTC")
target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_USE_HIPRTC=1)
target_link_libraries(migraphx_gpu PUBLIC hiprtc::hiprtc)
else()
# Without hipRTC, kernels are compiled at runtime by invoking the HIP Clang
# driver; record the compiler path and the cleaned-up flag set as macros.
message(STATUS "MIGraphX is using HIP Clang")
# Get flags needed to compile hip
include(TargetFlags)
target_flags(HIP_COMPILER_FLAGS hip::device)
# Remove cuda arch flags
string(REGEX REPLACE "--cuda-gpu-arch=[a-z0-9]+ ?" "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REGEX REPLACE "--offload-arch=[a-z0-9:+-]+ ?" "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
# Skip library paths since hip will incorrectly treat it as a source file
string(APPEND HIP_COMPILER_FLAGS " ")
if(WIN32)
string(REPLACE "\\" "/" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
endif()
foreach(_unused RANGE 2)
string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
endforeach()
message(STATUS "Hip compiler flags: \"${HIP_COMPILER_FLAGS}\"")
target_compile_definitions(migraphx_gpu PRIVATE
-DMIGRAPHX_HIP_COMPILER="${CMAKE_CXX_COMPILER}"
-DMIGRAPHX_HIP_COMPILER_FLAGS="${HIP_COMPILER_FLAGS}"
)
# Forward any compiler launcher (e.g. ccache) to the runtime compiler too.
if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
if(WIN32)
execute_process(COMMAND where ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER)
else()
execute_process(COMMAND which ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER)
endif()
string(STRIP "${MIGRAPHX_HIP_COMPILER_LAUNCHER}" MIGRAPHX_HIP_COMPILER_LAUNCHER)
target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_HIP_COMPILER_LAUNCHER="${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
endif()
endif()
target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_CXX_COMPILER="${CMAKE_CXX_COMPILER}")
# --- Library feature detection, linking, and install ---------------------------
# Check miopen find mode api
include(CheckLibraryExists)
if (MIGRAPHX_USE_MIOPEN)
get_target_property(MIOPEN_LOCATION MIOpen LOCATION)
target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_USE_MIOPEN=1)
check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
else()
target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_USE_MIOPEN=0)
endif()
if(MIGRAPHX_USE_ROCBLAS)
get_target_property(ROCBLAS_LOCATION roc::rocblas LOCATION)
target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_USE_ROCBLAS=1)
# Beta API for automated GEMM tuning
check_library_exists(roc::rocblas "rocblas_gemm_ex_get_solutions" "${ROCBLAS_LOCATION}" HAS_ROCBLAS_TUNING_BETA_FEATURE_API)
# rocblas FP8 API
check_library_exists(roc::rocblas "rocblas_gemm_strided_batched_ex3" "${ROCBLAS_LOCATION}" HAS_ROCBLAS_FP8_BETA_API)
else()
target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_USE_ROCBLAS=0)
endif()
if(MIGRAPHX_USE_HIPBLASLT)
target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_USE_HIPBLASLT=1)
else()
target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_USE_HIPBLASLT=0)
endif()
# Enable the MIOpen Find-2.0 API when available; prefer preallocated tensors.
if(MIGRAPHX_USE_MIOPEN)
set(MIGRAPHX_USE_FIND_2_API "${HAS_FIND_2_API}" CACHE BOOL "")
if(MIGRAPHX_USE_FIND_2_API)
check_library_exists(MIOpen "miopenSetFindOptionPreallocatedTensor" "${MIOPEN_LOCATION}" HAS_PREALLOCATION_API)
if(HAS_PREALLOCATION_API)
target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API -DMIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS)
else()
target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
endif()
message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
else()
message(STATUS "MIGraphx is using legacy Find API in MIOpen")
endif()
if(HAS_FIND_MODE_API)
target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_MODE_API)
message(STATUS "MIGraphx is using Find Mode API of MIOpen")
else()
message(STATUS "MIOpen does not have find mode api")
endif()
target_link_libraries(migraphx_gpu PUBLIC MIOpen)
endif()
# Opt into rocBLAS beta APIs (GEMM tuning, FP8) when the library exports them.
if(MIGRAPHX_USE_ROCBLAS)
if(HAS_ROCBLAS_TUNING_BETA_FEATURE_API)
target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_USE_ROCBLAS_TUNING_API -DROCBLAS_BETA_FEATURES_API -DROCBLAS_NO_DEPRECATED_WARNINGS)
message(STATUS "MIGraphx is using Beta API of rocBLAS")
else()
message(STATUS "rocBLAS does not have User Tuning Beta API")
endif()
if(HAS_ROCBLAS_FP8_BETA_API)
target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_USE_ROCBLAS_FP8_API -DROCBLAS_BETA_FEATURES_API -DROCBLAS_NO_DEPRECATED_WARNINGS)
message(STATUS "MIGraphX is using Beta API of rocBLAS for FP8 computations")
else()
message(STATUS "rocBLAS does not have Fp8 Beta API")
endif()
target_link_libraries(migraphx_gpu PUBLIC roc::rocblas)
endif()
if(MIGRAPHX_USE_HIPBLASLT)
target_link_libraries(migraphx_gpu PUBLIC roc::hipblaslt)
endif()
if(WIN32)
# Temporary workaround on rocMLIR not exporting correctly libraries it depends on.
target_link_libraries(migraphx_gpu PRIVATE ntdll)
endif()
target_link_libraries(migraphx_gpu PUBLIC migraphx)
# Without MIOpen/rocBLAS the device library must be visible to consumers.
if(NOT MIGRAPHX_USE_MIOPEN AND NOT MIGRAPHX_USE_ROCBLAS)
target_link_libraries(migraphx_gpu PUBLIC migraphx_device)
else()
target_link_libraries(migraphx_gpu PRIVATE migraphx_device)
endif()
target_link_libraries(migraphx_gpu PRIVATE migraphx_kernels)
if(MIGRAPHX_USE_COMPOSABLEKERNEL)
target_link_libraries(migraphx_gpu PRIVATE composable_kernel::jit_library)
target_compile_definitions(migraphx_gpu PRIVATE MIGRAPHX_USE_COMPOSABLEKERNEL=1)
endif()
add_subdirectory(driver)
add_subdirectory(hiprtc)
rocm_install_targets(
PRIVATE
TARGETS migraphx_gpu migraphx_device compile_for_gpu
INCLUDE
${CMAKE_CURRENT_SOURCE_DIR}/include
)

View File

@ -0,0 +1,61 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/abs.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
#if MIGRAPHX_USE_MIOPEN
// Validate that inputs are the two packed shapes {x, y}; the op's result
// shape is x's shape.
shape miopen_abs::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2).packed();
return inputs.at(0);
}
// Compute |x| via MIOpen's activation-forward path using the descriptor `ad`
// prepared in finalize(); args[0] is the input, args[1] is the preallocated
// output buffer which is also returned.
// NOTE(review): the miopenActivationForward status code is ignored here.
argument miopen_abs::compute(context& ctx,
const shape& output_shape,
const std::vector<argument>& args) const
{
// Plain y = |x| scaling: alpha * op(x) + beta * y.
float alpha = 1;
float beta = 0;
auto x_desc = make_tensor(args[0].get_shape());
auto y_desc = make_tensor(output_shape);
miopenActivationForward(ctx.get_stream().get_miopen(),
ad.get(),
&alpha,
x_desc.get(),
args[0].implicit(),
&beta,
y_desc.get(),
args[1].implicit());
return args[1];
}
// Create the MIOpen activation descriptor (abs mode) once before execution.
void miopen_abs::finalize(context&, const shape&, const std::vector<shape>&) { ad = make_abs(); }
#endif
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,48 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/module.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Name of the op used for regular device buffer allocation.
std::string gpu_allocation_model::name() const { return "hip::allocate"; }
// Build a hip::allocate op producing a device buffer of shape s.
operation gpu_allocation_model::allocate(const shape& s) const
{
return make_op(name(), {{"shape", to_value(s)}});
}
// Build an op that preallocates device memory of shape s, tagged with id.
operation gpu_allocation_model::preallocate(const shape& s, const std::string& id) const
{
return make_op("hip::hip_allocate_memory", {{"shape", to_value(s)}, {"id", id}});
}
// Name of the op used to copy data into an allocated buffer.
std::string gpu_allocation_model::copy() const { return "hip::copy"; }
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,82 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/analyze_streams.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/value.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Stream model for HIP: maps instructions to the stream they were scheduled
// on, so the generic migraphx::analyze_streams can detect cross-stream races.
struct hip_stream_model
{
// Highest stream index seen (streams are 0-based, hence nstream = max + 1).
std::size_t max_stream = 0;
std::unordered_map<migraphx::instruction_ref, std::size_t> ins2stream{};
std::size_t get_nstream() const { return max_stream + 1; }
std::size_t get_stream(migraphx::instruction_ref ins) const { return ins2stream.at(ins); }
// Event id is read from the instruction's operator value ("event" key).
std::size_t get_event_id(migraphx::instruction_ref ins) const
{
auto v = ins->get_operator().to_value();
return v["event"].to<std::size_t>();
}
bool has_stream(migraphx::instruction_ref ins) const { return ins2stream.count(ins) > 0; }
bool is_record(migraphx::instruction_ref ins) const
{
return ins->name() == "gpu::record_event";
}
bool is_wait(migraphx::instruction_ref ins) const { return ins->name() == "gpu::wait_event"; }
};
// Builds a hip_stream_model by scanning the module: each "gpu::set_stream"
// changes the current stream, and every subsequent instruction that actually
// uses the GPU context (excluding allocations, copied literals and parameters)
// is recorded as executing on that stream.
stream_model make_stream_model(const module& m)
{
    hip_stream_model hsm;
    std::size_t stream = 0;
    for(auto ins : iterator_for(m))
    {
        if(ins->name() == "gpu::set_stream")
        {
            auto v = ins->get_operator().to_value();
            stream = v["stream"].to<std::size_t>();
            hsm.max_stream = std::max(stream, hsm.max_stream);
        }
        // Context-free instructions never touch a stream, so they cannot race
        if(ins->get_operator().is_context_free())
            continue;
        if(contains({"hip::hip_allocate_memory", "hip::hip_copy_literal", "@param"}, ins->name()))
            continue;
        hsm.ins2stream[ins] = stream;
    }
    return hsm;
}
// Runs the generic stream-race analysis over the module using the HIP stream
// model derived from the module's "gpu::set_stream" instructions.
std::vector<stream_race> analyze_streams(const module& m)
{
    return migraphx::analyze_streams(m, make_stream_model(m));
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,50 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/argmax.hpp>
#include <migraphx/gpu/device/argmax.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/tune_axis.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Shape inference: expects two inputs (data, pre-allocated output buffer);
// the result shape is the argmax op's normalized shape over the data only.
shape hip_argmax::compute_shape(const std::vector<shape>& inputs) const
{
    check_shapes{inputs, *this}.has(2);
    return op.normalize_compute_shape({inputs.at(0)});
}
// Launches the device argmax kernel: args.front() is the input tensor,
// args.back() the output buffer, which is also returned.
argument hip_argmax::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    auto n_dim = args.front().get_shape().lens().size();
    // Normalize a possibly-negative axis into [0, n_dim)
    int64_t tuned_axis = tune_axis(n_dim, op.axis, op.name());
    device::argmax(
        ctx.get_stream().get(), args.back(), args.front(), tuned_axis, op.select_last_index);
    return args.back();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,50 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/argmin.hpp>
#include <migraphx/gpu/device/argmin.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/tune_axis.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Shape inference: expects two inputs (data, pre-allocated output buffer);
// the result shape is the argmin op's normalized shape over the data only.
shape hip_argmin::compute_shape(const std::vector<shape>& inputs) const
{
    check_shapes{inputs, *this}.has(2);
    return op.normalize_compute_shape({inputs.at(0)});
}
// Launches the device argmin kernel: args.front() is the input tensor,
// args.back() the output buffer, which is also returned.
argument hip_argmin::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    auto n_dim = args.front().get_shape().lens().size();
    // Normalize a possibly-negative axis into [0, n_dim)
    int64_t tuned_axis = tune_axis(n_dim, op.axis, op.name());
    device::argmin(
        ctx.get_stream().get(), args.back(), args.front(), tuned_axis, op.select_last_index);
    return args.back();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,67 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/code_object_op.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_REGISTER_OP(code_object_op);
// Verifies that the incoming shapes (after standard-normalization) are still
// compatible with the shapes this code object was compiled for, then returns
// the precomputed output shape.
shape code_object_op::compute_shape(std::vector<shape> inputs) const
{
    std::transform(inputs.begin(), inputs.end(), inputs.begin(), [](const shape& s) {
        return s.normalize_standard();
    });
    auto einputs = expected_inputs;
    std::transform(einputs.begin(), einputs.end(), einputs.begin(), [](const shape& s) {
        return s.normalize_standard();
    });
    // Tuple shapes are flattened so compatibility is checked element-wise
    if(not migraphx::equal(flatten(einputs), flatten(inputs), &shape::is_compatible))
        MIGRAPHX_THROW("Input shapes have changed: [" + to_string_range(einputs) + "] -> [" +
                       to_string_range(inputs) + "]");
    return output;
}
// Launches the precompiled kernel: flattens the arguments to raw device
// pointers, launches with the stored global/local sizes, and returns the
// designated output argument.
argument
code_object_op::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    auto fargs = flatten(args);
    std::vector<void*> kargs(fargs.size());
    std::transform(
        fargs.begin(), fargs.end(), kargs.begin(), [](const argument& a) { return a.data(); });
    // Perf events come from the context; presumably null when profiling is
    // disabled — TODO confirm against context::get_perf_events
    auto [start, stop] = ctx.get_perf_events();
    k.launch(ctx.get_stream().get(), global, local, std::move(kargs), start, stop);
    return args[get_output_arg(args.size())];
}
// Loads the kernel from the stored code object; must run before compute().
void code_object_op::finalize(context&, const shape&, const std::vector<shape>&)
{
    assert(not code_object.empty());
    k = kernel(code_object, symbol_name);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,576 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/prepare_reduce.hpp>
#include <migraphx/algorithm.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/module.hpp>
#include <migraphx/rewrite_quantization.hpp>
#include <migraphx/optimize_module.hpp>
#include <migraphx/cpp_generator.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/array.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/fp8_types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace gen {
// Candidate vector widths to try, widest first. When every input is
// half-precision only half2 is attempted; otherwise 4 then 2.
static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
{
    const bool all_half = std::none_of(inputs.begin(), inputs.end(), [](const auto& s) {
        return s.type() != shape::half_type;
    });
    if(all_half)
        return {2};
    return {4, 2};
}
// Picks the widest vector width from `sizes` that every input supports along
// `axis`. An input supports width v when its axis stride is 0 or 1, the axis
// length divides evenly by v, and every other stride is 0, 1 or divisible by
// v. Returns width 1 (no vectorization) for fp8 inputs or when every input
// has axis length 1.
vectorize vectorize::elements(std::size_t axis,
                              const std::vector<shape>& inputs,
                              const std::vector<std::size_t>& sizes)
{
    // disable vectorization for fp8 types
    if(std::any_of(inputs.begin(), inputs.end(), [&](auto ishape) {
           return contains(fp8_types{}.get(), ishape.type());
       }))
        return {1, axis};
    if(std::all_of(
           inputs.begin(), inputs.end(), [&](const auto& s) { return s.lens()[axis] == 1; }))
        return {1, axis};
    // Per-input maximum usable width; the final width is the minimum of these
    std::vector<std::size_t> max_vec_size;
    std::transform(inputs.begin(),
                   inputs.end(),
                   std::back_inserter(max_vec_size),
                   [&](const auto& input) -> std::size_t {
                       auto stride = input.strides()[axis];
                       auto len = input.lens()[axis];
                       if(not contains({0, 1}, stride))
                           return 1;
                       // Axis is broadcast for this input but the tensor is
                       // large enough: allow the widest candidate
                       if(len == 1 and input.elements() > sizes.front())
                           return sizes.front();
                       auto it = std::find_if(sizes.begin(), sizes.end(), [&](auto vsize) {
                           // The len is divisible by the size and all the strides are divisible by
                           // the size
                           return (len % vsize) == 0 and
                                  std::all_of(
                                      input.strides().begin(), input.strides().end(), [&](auto i) {
                                          return contains({0, 1}, i) or i % vsize == 0;
                                      });
                       });
                       if(it != sizes.end())
                           return *it;
                       return 1;
                   });
    return {*std::min_element(max_vec_size.begin(), max_vec_size.end()), axis};
}
// Device-aware overload: only vectorize when there are more elements than the
// device can cover with individual work-items (the over-subscription factor),
// and allow a wider width (8) when inputs are broadcasted.
vectorize vectorize::elements(context& ctx, std::size_t axis, const std::vector<shape>& inputs)
{
    // disable vectorization for fp8 types
    if(std::any_of(inputs.begin(), inputs.end(), [&](auto ishape) {
           return contains(fp8_types{}.get(), ishape.type());
       }))
        return {1, axis};
    if(inputs.empty())
        return {1, axis};
    // Largest input determines the total element count to cover
    std::size_t n = std::max_element(inputs.begin(),
                                     inputs.end(),
                                     by(std::less<>{}, [](const auto& s) { return s.elements(); }))
                        ->elements();
    std::size_t max_global = ctx.get_current_device().get_cu_count() *
                             ctx.get_current_device().get_max_workitems_per_cu();
    // How many elements each work-item would need to process
    std::size_t over = n / max_global;
    bool broadcasted =
        std::any_of(inputs.begin(), inputs.end(), [](const auto& s) { return s.broadcasted(); });
    std::vector<std::size_t> sizes;
    if(broadcasted and over > 8)
        sizes.push_back(8);
    if(over > 4)
        sizes.push_back(4);
    sizes.push_back(2);
    return elements(axis, inputs, sizes);
}
// Convenience overload: candidate widths are derived from the input types.
vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs)
{
    return elements(axis, inputs, vector_sizes(inputs));
}
// Renders as a C++ instantiation: vectorize<size, axis>().
std::string vectorize::str() const
{
    std::string out = "vectorize<";
    out += to_string(size);
    out += ", ";
    out += to_string(axis);
    out += ">()";
    return out;
}
// Decides which broadcasted inputs (stride 0 along `axis`) to preload into
// LDS: smallest tensors first, stopping once the 4KB budget would be
// exceeded. Returns one flag per input.
preload preload::broadcasts(std::size_t axis, const std::vector<shape>& inputs)
{
    const std::size_t max_lds_bytes = 4096;
    std::vector<bool> result(inputs.size());
    std::vector<std::size_t> preloaded;
    auto idxs = range(inputs.size());
    // Candidates: inputs broadcast along the given axis
    std::copy_if(idxs.begin(), idxs.end(), std::back_inserter(preloaded), [&](auto i) {
        return inputs[i].strides()[axis] == 0;
    });
    // Prefer the smallest inputs so as many as possible fit in the budget
    std::sort(preloaded.begin(), preloaded.end(), by(std::less<>{}, [&](auto i) {
                  return inputs[i].bytes();
              }));
    std::size_t bytes = 0;
    for(auto i : preloaded)
    {
        const auto& input = inputs[i];
        bytes += input.bytes();
        if(bytes > max_lds_bytes)
            break;
        result[i] = true;
    }
    return {result};
}
// Renders as an auto_preload<...> instantiation. The last element of args
// (the output) is deliberately not emitted.
std::string preload::str() const
{
    std::vector<std::string> flags;
    std::for_each(args.begin(), std::prev(args.end()), [&](bool b) {
        flags.emplace_back(b ? "true" : "false");
    });
    return "auto_preload<false, " + join_strings(flags, ", ") + ">(idx)";
}
// True when at least one argument is flagged for LDS preloading.
bool preload::is_preloading() const
{
    return std::any_of(args.begin(), args.end(), [](bool b) { return b; });
}
// Ceiling division for unsigned operands; y must be non-zero.
//
// The previous form (x + y - 1) / y wraps around when x is near SIZE_MAX,
// silently producing a tiny quotient; dividing first avoids any overflowing
// intermediate while giving identical results everywhere else.
static std::size_t integer_divide_ceil(std::size_t x, std::size_t y)
{
    return x / y + static_cast<std::size_t>(x % y != 0);
}
static std::size_t compute_tile_factor(std::size_t r, std::size_t max_size = 64)
{
std::size_t n = 1;
auto factors = make_array(2, 3, 5, 7, 11);
while(n < max_size)
{
// NOLINTNEXTLINE(readability-qualified-auto)
auto it = std::find_if(factors.begin(), factors.end(), [&](auto d) { return r % d == 0; });
if(it == factors.end())
break;
r /= *it;
n *= *it;
}
return n;
}
// Plans a 2D LDS tiling for a kernel whose last `noutputs` inputs are the
// outputs. Exactly one non-broadcasted argument whose fast axis is not the
// last dimension may be tiled (a transpose-like access); otherwise an empty
// tile (no-op) is returned. Tile dims come from small-prime factoring of the
// tiled axis and the last axis, bounded by an LDS byte budget.
tile tile::elements(const std::vector<shape>& inputs, std::size_t noutputs)
{
    tile result;
    auto ndim = inputs.front().ndim();
    std::vector<std::size_t> faxes;
    std::transform(
        inputs.begin(), inputs.end(), std::back_inserter(faxes), MIGRAPHX_LIFT(find_fast_axis));
    // Tile along the slowest of the per-input fast axes
    result.axis = std::accumulate(faxes.begin(), faxes.end(), ndim, MIGRAPHX_LIFT(std::min));
    if(result.axis >= (ndim - 1))
        return {};
    // An argument participates (as load or store) only when it is not
    // broadcasted and its fast axis is not already the last dimension
    auto select = [&](auto m) {
        return [&, m](std::size_t faxis, shape input) {
            if(input.broadcasted())
                return none;
            if(faxis < (ndim - 1))
                return m;
            return none;
        };
    };
    std::transform(faxes.begin(),
                   faxes.end() - noutputs,
                   inputs.begin(),
                   std::back_inserter(result.args),
                   select(load));
    std::transform(faxes.end() - noutputs,
                   faxes.end(),
                   inputs.end() - noutputs,
                   std::back_inserter(result.args),
                   select(store));
    auto nargs = std::count_if(
        result.args.begin(), result.args.end(), [](auto m) { return m != mode::none; });
    // TODO: Handle tiling more than one arguments
    if(nargs != 1)
        return {};
    const auto& s = inputs.front();
    auto dim1 = compute_tile_factor(s.lens()[result.axis]);
    auto dim2 = compute_tile_factor(s.lens().back(), 4096 / dim1);
    if(dim1 == 1 or dim2 == 1)
        return {};
    // inner = per-tile extent (1 everywhere except the two tiled axes),
    // outer = number of tiles along each axis
    result.inner = s.lens();
    std::fill(result.inner.begin(), result.inner.end(), 1);
    result.inner[result.axis] = dim1;
    result.inner.back() = dim2;
    result.outer = s.lens();
    result.outer[result.axis] /= dim1;
    result.outer.back() /= dim2;
    auto tile_size = dim1 * dim2;
    result.ntiles = s.elements() / tile_size;
    // equivalent to dim1 * (dim2 + 1) to avoid bank conflicts
    auto tile_bytes = (tile_size + dim1) * s.type_size();
    if(tile_bytes > 65536)
        return {};
    // One block per tile, rounded to a multiple of 64 (wavefront), capped at 256
    result.block_size = std::min<std::size_t>(256, integer_divide_ceil(tile_size / 4, 64) * 64);
    return result;
}
// Renders as an auto_tile<...> instantiation; an empty mode list renders as
// the no-op transform_args().
std::string tile::str() const
{
    if(args.empty())
        return "transform_args()";
    std::vector<std::string> strs;
    std::transform(args.begin(), args.end(), std::back_inserter(strs), [](mode m) {
        switch(m)
        {
        case load: return "tile::load";
        case store: return "tile::store";
        case none: return "tile::none";
        }
        MIGRAPHX_THROW("Invalid mode");
    });
    const std::string auto_tile = "auto_tile<${modes}>(${inner}, ${outer})";
    return interpolate_string(auto_tile,
                              {{"modes", join_strings(strs, ", ")},
                               {"inner", generate_index_ints(inner)},
                               {"outer", generate_index_ints(outer)}});
}
// Returns the fastest-moving axis of a shape. Scalars report the last axis;
// broadcasted shapes pick the smallest non-zero stride (stride-0 axes are
// never chosen); other shapes derive it from the layout permutation.
std::size_t find_fast_axis(const shape& input)
{
    if(input.scalar())
        return input.ndim() - 1;
    if(input.broadcasted())
    {
        // Map stride 0 to the largest key so broadcast axes sort last
        auto stride_it = std::min_element(
            input.strides().begin(), input.strides().end(), by(std::less<>{}, [](std::size_t i) {
                if(i == 0)
                    return std::numeric_limits<std::size_t>::max();
                return i;
            }));
        return stride_it - input.strides().begin();
    }
    auto permutation = invert_permutation(find_permutation(input));
    auto it = std::max_element(permutation.begin(), permutation.end());
    return it - permutation.begin();
}
std::size_t find_fast_axis(const std::vector<shape>& inputs)
{
auto permutation = invert_permutation(find_permutation(inputs));
auto it = std::max_element(permutation.begin(), permutation.end());
return it - permutation.begin();
}
// Joins generated transformer expressions into a single comma-separated
// argument list for transform_args(...).
std::string make_transformer_args(std::vector<std::string> transformers)
{
    return join_strings(std::move(transformers), ", ");
}
// Lowers a pointwise module to a __device__ function named `name` appended to
// the generator `gg`. The module copy is optimized and sorted first; several
// operators are mapped to expression templates understood by the kernel
// headers, and results are wrapped in explicit type conversions.
static void generate_pointwise(cpp_generator& gg,
                               const module& pm,
                               const std::string& name,
                               bool always_return_tuple = false)
{
    module m = pm;
    run_passes(m, {rewrite_quantization{}, optimize_module{}});
    m.sort();
    cpp_generator g;
    g.always_return_tuple(always_return_tuple);
    g.fmap([](const std::string& fname) { return "migraphx::" + fname; });
    g.add_point_op("where", "${function:where}(${0}, ${1}, ${2})");
    g.add_point_op("prelu", "${function:where}(${0} < 0, ${0} * ${1}, ${0})");
    g.add_point_op("sign", "${function:where}(${0} > 0, 1, ${function:where}(${0} < 0, -1, 0))");
    g.add_point_op("equal", "migraphx::abs(${0} == ${1})");
    g.add_point_op("less", "migraphx::abs(${0} < ${1})");
    g.add_point_op("greater", "migraphx::abs(${0} > ${1})");
    g.add_point_op("not", "migraphx::abs(not ${0})");
    // Add explicit conversions
    g.fresult(
        [](const shape& s) { return "migraphx::convert<" + shape::cpp_type(s.type()) + ">"; });
    gg.create_function(g.generate_module(m)
                           .set_attributes({"__device__", "__attribute__((const))"})
                           .set_generic_types(m)
                           .set_name(name));
}
// Standalone variant: returns the generated source for a single pointwise
// function instead of appending to an existing generator.
std::string generate_pointwise(const module& pm, const std::string& name, bool always_return_tuple)
{
    cpp_generator g;
    generate_pointwise(g, pm, name, always_return_tuple);
    return g.str();
}
// Renders the reduction expression:
//   write(r.reduce(reduction, init, read)(inputs...))
std::string reduce_op::str() const
{
    return write + "(r.reduce(" + reduction + ", " + init + ", " + read + ")(" +
           join_strings(inputs, ", ") + "))";
}
// Selects the reduction operator, initial value, and pre/post transforms for
// a reduce_* operator. reduce_mean is lowered to a sum with the mean applied
// on read for floating-point types (or on write for integral types, where
// dividing each element first would truncate).
void reduce_op::set(const std::string& name, const shape& input, const shape& output)
{
    assert(input.type() != shape::tuple_type);
    assert(output.type() != shape::tuple_type);
    if(name == "reduce_sum")
    {
        reduction = "op::sum{}";
    }
    else if(name == "reduce_mean")
    {
        // Number of elements folded into one output value
        auto reduce_elements = input.elements() / output.elements();
        auto reduce_type = input.type();
        reduction = "op::sum{}";
        std::string mean = "op::mean<" + std::to_string(reduce_elements) + ">{}";
        // Use float accumulator when reduction size is too large for half
        if(reduce_type == shape::half_type and reduce_elements > 16384)
            read = "compose(" + mean + ", op::convert_to<float>{})";
        else if(contains({shape::float_type, shape::half_type, shape::double_type}, reduce_type))
            read = mean;
        else
            write = mean;
    }
    else if(name == "reduce_max")
    {
        reduction = "op::max{}";
        init = "lowest{}";
    }
    else if(name == "reduce_min")
    {
        reduction = "op::min{}";
        init = "highest{}";
    }
    else if(name == "reduce_prod")
    {
        reduction = "op::product{}";
        init = "1";
    }
    else if(name == "reduce_any")
    {
        reduction = "op::logical_or{}";
        init = "bool{false}";
    }
    else if(name == "reduce_all")
    {
        reduction = "op::logical_and{}";
        init = "bool{true}";
    }
    else
    {
        MIGRAPHX_THROW("Unsupported reduce");
    }
}
// Configures the reduce from an instruction. gpu::parallel_reduce wraps an
// inner reduce op and yields a tuple, so the read transform is lifted to
// apply element-wise over an array.
void reduce_op::set(instruction_ref ins, const operation& op)
{
    if(op.name() == "gpu::parallel_reduce")
    {
        auto rop = from_value<operation>(op.to_value().at("op"));
        auto input = ins->inputs().front()->get_shape();
        auto output = ins->get_shape().sub_shapes().front();
        set(rop.name(), input, output);
        read = "compose(array_apply(" + read + "), MIGRAPHX_LIFT(make_array))";
    }
    else
    {
        set(op.name(), ins->inputs().front()->get_shape(), ins->get_shape());
    }
}
// Convenience: builds the reduce expression string for an instruction whose
// argument names `x` have already been rendered.
std::string reduce_op::generate(instruction_ref ins, const std::vector<std::string>& x)
{
    reduce_op r{x};
    r.set(ins, ins->get_operator());
    return r.str();
}
// Whether the generated code may use r.lazy_inner instead of r.inner: only
// when this instruction feeds exactly one consumer that is itself a reduce
// (or the module return), so evaluation can be deferred into the reduction.
static bool use_lazy_inner(instruction_ref ins)
{
    if(ins->outputs().size() != 1)
        return false;
    // When the inputs are broadcasted, it means the lambda will capture SGPRs
    // when doing block/wave reduction. This can cause register spilling in
    // the compiler when the lambda is evaluated at a later time although it
    // shouldn't. Instead, use `inner` to workaround this issue in the
    // compiler.
    if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](instruction_ref input) {
           return input->get_shape().broadcasted();
       }))
        return false;
    auto output = ins->outputs().front();
    return contains(output->name(), "reduce") or output->name() == "@return";
}
// Inserts an identity after every parameter that has more than one consumer,
// so the parameter value is materialized once rather than re-read by each
// consumer.
void preload_params(module& m)
{
    for(auto ins : iterator_for(m))
    {
        if(ins->name() != "@param" or ins->outputs().size() <= 1)
            continue;
        auto id = m.insert_instruction(std::next(ins), make_op("identity"), ins);
        m.replace_instruction(ins, id);
    }
}
// Generates the device source for a fused reduction kernel from module `m`.
// Reduce instructions become r.reduce(...) expressions; pointwise sub-modules
// are emitted as helper functions and invoked through r.inner/r.lazy_inner so
// full-size tensor arguments are indexed by the reduction while smaller
// (already-reduced) arguments are indexed by out_idx.
std::string generate_reduce(module m, const std::string& name)
{
    preload_params(m);
    run_passes(m, {optimize_module{}, prepare_reduce{}, optimize_module{}});
    m.sort();
    cpp_generator g;
    g.always_return_tuple();
    // The largest parameter defines the full (pre-reduction) iteration shape
    auto param_shapes = m.get_parameter_shapes();
    auto max_shape =
        std::max_element(param_shapes.begin(),
                         param_shapes.end(),
                         by(std::less<>{}, [](const auto& p) { return p.second.elements(); }));
    auto ilens = max_shape->second.lens();
    std::size_t i = 0;
    auto f = g.generate_module(m, [&](instruction_ref ins, const auto& names) {
        if(contains(ins->name(), "reduce"))
        {
            return reduce_op::generate(ins, cpp_generator::to_args(ins->inputs(), names));
        }
        if(ins->name() == "pointwise")
        {
            auto pointwise_name = "pointwise" + std::to_string(i);
            i++;
            generate_pointwise(g, *ins->module_inputs().front(), pointwise_name);
            // Tensors iterated by the reduction: full-size, non-broadcasted
            std::vector<instruction_ref> tensors;
            std::copy_if(ins->inputs().begin(),
                         ins->inputs().end(),
                         std::back_inserter(tensors),
                         [&](auto input) {
                             return input->get_shape().lens() == ilens and
                                    not input->get_shape().broadcasted();
                         });
            auto inner_names = names;
            // Non-tensor parameters are indexed by the output index
            for(auto input : ins->inputs())
            {
                if(input->name() != "@param")
                    continue;
                if(contains(tensors, input))
                    continue;
                inner_names[input] += "[out_idx]";
            }
            // Tensor arguments become lambda parameters of the inner call
            for(auto input : tensors)
                inner_names[input] += "_lambda_param";
            auto call_function =
                pointwise_name + "(" +
                join_strings(cpp_generator::to_args(ins->inputs(), inner_names), ", ") + ")";
            if(tensors.empty())
                return call_function;
            const std::string inner_template =
                "r.${inner}([=](${params}) { return ${call}; })(${args})";
            std::string inner_name = use_lazy_inner(ins) ? "lazy_inner" : "inner";
            auto args = cpp_generator::to_args(tensors, names);
            auto params = cpp_generator::to_args(tensors, inner_names);
            std::transform(
                params.begin(), params.end(), params.begin(), [](auto s) { return "auto " + s; });
            return interpolate_string(inner_template,
                                      {{"inner", inner_name},
                                       {"params", join_strings(params, ", ")},
                                       {"args", join_strings(args, ", ")},
                                       {"call", call_function}});
        }
        if(ins->name() == "multibroadcast")
        {
            // Broadcasts are free: reuse the input's name
            return names.at(ins->inputs().front());
        }
        if(ins->name() == "get_tuple_elem")
        {
            const auto& x = names.at(ins->inputs().front());
            auto index = ins->get_operator().to_value()["index"].to<std::size_t>();
            return interpolate_string("${x}[${index}]",
                                      {{"x", x}, {"index", std::to_string(index)}});
        }
        if(ins->name() == "identity")
        {
            const auto& x = names.at(ins->inputs().front());
            return "r.inner(op::id{})(" + x + ")";
        }
        MIGRAPHX_THROW("Unknown operator: " + ins->name());
    });
    f.set_attributes({"__device__", "__attribute__((const))"}).set_generic_types(m).set_name(name);
    f.add_generic_param("r");
    f.add_generic_param("out_idx");
    f.unused_param("out_idx");
    g.create_function(f);
    return g.str();
}
// Collects operator names in the module in order, recursing into pointwise
// sub-modules; internal (@-prefixed) and layout no-op instructions are
// skipped.
static std::vector<std::string> get_op_names(const module& m)
{
    std::vector<std::string> result;
    for(auto& ins : m)
    {
        if(starts_with(ins.name(), "@"))
            continue;
        if(contains({"multibroadcast", "contiguous", "identity"}, ins.name()))
            continue;
        if(ins.name() == "pointwise")
        {
            auto names = get_op_names(*ins.module_inputs().front());
            result.insert(result.end(), names.begin(), names.end());
        }
        else
        {
            result.push_back(ins.name());
        }
    }
    return result;
}
std::string generate_name_from_ops(const module& m, const std::string& postname)
{
auto op_names = get_op_names(m);
if(not postname.empty())
op_names.push_back(postname);
if(op_names.empty())
return "noop";
return join_strings(op_names, "_");
}
} // namespace gen
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,406 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/errors.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/env.hpp>
#include <migraphx/fileutils.hpp>
#include <cassert>
#include <iostream>
#include <deque>
#ifdef MIGRAPHX_USE_HIPRTC
#include <hip/hiprtc.h>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/value.hpp>
#include <migraphx/tmp_dir.hpp>
#include <migraphx/dynamic_loader.hpp>
#include <migraphx/process.hpp>
#include <migraphx/msgpack.hpp>
#include <migraphx/serialize.hpp>
#include <migraphx/file_buffer.hpp>
#else
#include <migraphx/compile_src.hpp>
#include <migraphx/process.hpp>
#endif
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG_SYM);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_ASM);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC);
#ifdef MIGRAPHX_USE_HIPRTC
// Formats a hiprtc error code together with its string form and a message.
std::string hiprtc_error(hiprtcResult err, const std::string& msg)
{
    return "hiprtc: " + (hiprtcGetErrorString(err) + (": " + msg));
}
// Throws (with source context) when a hiprtc call did not succeed.
void hiprtc_check_error(hiprtcResult err, const std::string& msg, const std::string& ctx)
{
    if(err != HIPRTC_SUCCESS)
        throw make_exception(ctx, hiprtc_error(err, msg));
}
// NOLINTNEXTLINE
#define MIGRAPHX_HIPRTC(...) \
hiprtc_check_error(__VA_ARGS__, #__VA_ARGS__, MIGRAPHX_MAKE_SOURCE_CTX())
#define MIGRAPHX_HIPRTC_THROW(error, msg) MIGRAPHX_THROW(hiprtc_error(error, msg))
// hiprtcDestroyProgram takes hiprtcProgram* so it cannot be used directly as
// a smart-pointer deleter; adapt it to a by-value signature.
void hiprtc_program_destroy(hiprtcProgram prog) { hiprtcDestroyProgram(&prog); }
using hiprtc_program_ptr = MIGRAPHX_MANAGE_PTR(hiprtcProgram, hiprtc_program_destroy);
// Creates a hiprtcProgram, wrapping the handle before checking the result so
// anything hiprtc allocated is released even when creation reports failure.
template <class... Ts>
hiprtc_program_ptr hiprtc_program_create(Ts... xs)
{
    hiprtcProgram prog = nullptr;
    auto result = hiprtcCreateProgram(&prog, xs...);
    hiprtc_program_ptr p{prog};
    if(result != HIPRTC_SUCCESS)
        MIGRAPHX_HIPRTC_THROW(result, "Create program failed.");
    return p;
}
// RAII wrapper over a hiprtcProgram plus the stable string storage the hiprtc
// C API needs (it holds const char* views into our strings across calls).
struct hiprtc_program
{
    // Owns strings and a parallel array of c_str() pointers. A deque is used
    // so push_back never relocates previously stored strings, keeping the
    // cached pointers valid.
    struct string_array
    {
        std::deque<std::string> strings{};
        std::vector<const char*> c_strs{};
        string_array() {}
        string_array(const string_array&) = delete;
        std::size_t size() const { return strings.size(); }
        const char** data() { return c_strs.data(); }
        void push_back(std::string s)
        {
            strings.push_back(std::move(s));
            c_strs.push_back(strings.back().c_str());
        }
    };
    hiprtc_program_ptr prog = nullptr;
    string_array headers{};
    string_array include_names{};
    std::string cpp_src = "";
    std::string cpp_name = "";
    // Single-source constructor: one translation unit, no headers.
    hiprtc_program(const std::string& src, const std::string& name = "main.cpp")
        : cpp_src(src), cpp_name(name)
    {
        create_program();
    }
    // Multi-source constructor: the single .cpp file becomes the main unit,
    // everything else is registered as a header under its include name.
    hiprtc_program(std::vector<hiprtc_src_file> srcs)
    {
        for(auto&& src : srcs)
        {
            if(ends_with(src.path, ".cpp"))
            {
                cpp_src = std::move(src.content);
                cpp_name = std::move(src.path);
            }
            else
            {
                headers.push_back(std::move(src.content));
                include_names.push_back(std::move(src.path));
            }
        }
        create_program();
    }
    void create_program()
    {
        assert(not cpp_src.empty());
        assert(not cpp_name.empty());
        assert(headers.size() == include_names.size());
        prog = hiprtc_program_create(cpp_src.c_str(),
                                    cpp_name.c_str(),
                                    headers.size(),
                                    headers.data(),
                                    include_names.data());
    }
    // Compiles with the given options; the program log is echoed to stderr
    // unless quiet. Throws on compilation failure.
    void compile(const std::vector<std::string>& options, bool quiet = false) const
    {
        if(enabled(MIGRAPHX_TRACE_HIPRTC{}))
            std::cout << "hiprtc " << join_strings(options, " ") << " " << cpp_name << std::endl;
        std::vector<const char*> c_options;
        std::transform(options.begin(),
                       options.end(),
                       std::back_inserter(c_options),
                       [](const std::string& s) { return s.c_str(); });
        auto result = hiprtcCompileProgram(prog.get(), c_options.size(), c_options.data());
        auto prog_log = log();
        if(not prog_log.empty() and not quiet)
        {
            std::cerr << prog_log << std::endl;
        }
        if(result != HIPRTC_SUCCESS)
            MIGRAPHX_HIPRTC_THROW(result, "Compilation failed.");
    }
    std::string log() const
    {
        std::size_t n = 0;
        MIGRAPHX_HIPRTC(hiprtcGetProgramLogSize(prog.get(), &n));
        if(n == 0)
            return {};
        std::string buffer(n, '\0');
        MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data()));
        // NOTE(review): this assumes the reported size excludes a trailing
        // NUL — confirm against the hiprtc docs for the runtime in use.
        assert(buffer.back() != 0);
        return buffer;
    }
    std::vector<char> get_code_obj() const
    {
        std::size_t n = 0;
        MIGRAPHX_HIPRTC(hiprtcGetCodeSize(prog.get(), &n));
        std::vector<char> buffer(n);
        MIGRAPHX_HIPRTC(hiprtcGetCode(prog.get(), buffer.data()));
        return buffer;
    }
};
// Compiles HIP sources to a code object in-process with hiprtc. Appends the
// standard MIGraphX options (C++17 default, -fno-gpu-rdc, optimization level
// from MIGRAPHX_GPU_OPTIMIZE, target arch) to the caller's params.
std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_src_file> srcs,
                                                           const std::vector<std::string>& params,
                                                           const std::string& arch)
{
    hiprtc_program prog(std::move(srcs));
    auto options = params;
    options.push_back("-DMIGRAPHX_USE_HIPRTC=1");
    if(enabled(MIGRAPHX_GPU_DEBUG{}))
        options.push_back("-DMIGRAPHX_DEBUG");
    // Only add a default standard when the caller did not choose one
    if(std::none_of(options.begin(), options.end(), [](const std::string& s) {
           return starts_with(s, "--std=") or starts_with(s, "-std=");
       }))
        options.push_back("-std=c++17");
    options.push_back("-fno-gpu-rdc");
    options.push_back("-O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3"));
    options.push_back("-Wno-cuda-compat");
    options.push_back("--offload-arch=" + arch);
    prog.compile(options);
    return {prog.get_code_obj()};
}
// Probes whether the HIP compiler accepts the given flags by compiling a
// trivial translation unit for a fixed architecture; any compilation failure
// means the flags are unsupported.
//
// Fix: the previous version constructed an unused `hiprtc_program prog{" "}`
// before the try-block — dead work that talks to hiprtc, and any exception it
// threw would escape instead of being reported as "flags unsupported".
bool hip_has_flags(const std::vector<std::string>& flags)
{
    std::string src = " ";
    src_file input{"main.cpp", src};
    std::vector<src_file> srcs = {input};
    try
    {
        std::string arch = "gfx900";
        compile_hip_src(srcs, flags, arch);
        return true;
    }
    catch(...)
    {
        return false;
    }
}
// Compiles HIP sources, preferring the out-of-process migraphx-hiprtc-driver
// (looked up next to this shared library, then in ../bin) so a hiprtc crash
// cannot take down the host process; falls back to in-process hiprtc when the
// driver is missing or produced no output.
std::vector<std::vector<char>> compile_hip_src(const std::vector<src_file>& srcs,
                                               const std::vector<std::string>& params,
                                               const std::string& arch)
{
    std::vector<hiprtc_src_file> hsrcs{srcs.begin(), srcs.end()};
    if(enabled(MIGRAPHX_GPU_DUMP_SRC{}))
    {
        for(const auto& src : srcs)
        {
            if(src.path.extension() != ".cpp")
                continue;
            std::cout << std::string(src.content) << std::endl;
        }
    }
    auto fname = make_executable_filename("migraphx-hiprtc-driver");
    // Locate the driver relative to the library containing this function
    auto p = dynamic_loader::path(&compile_hip_src_with_hiprtc);
    auto driver = p.parent_path() / fname;
    bool found = fs::exists(driver);
    if(not found)
    {
        driver = p.parent_path().parent_path() / "bin" / fname;
        found = fs::exists(driver);
    }
    if(found)
    {
        value v;
        v["srcs"] = to_value(hsrcs);
        v["params"] = to_value(params);
        v["arch"] = to_value(arch);
        tmp_dir td{};
        auto out = td.path / "output";
        // Inputs are streamed to the driver as msgpack over stdin; the driver
        // writes the resulting code object to the requested output file
        process(driver, {quote_string(out.string())}).write([&](auto writer) {
            to_msgpack(v, writer);
        });
        // A missing output means the driver failed; fall through to in-process
        if(fs::exists(out))
            return {read_buffer(out)};
    }
    return compile_hip_src_with_hiprtc(std::move(hsrcs), params, arch);
}
#else // MIGRAPHX_USE_HIPRTC
std::vector<std::vector<char>>
compile_hip_src_with_hiprtc(std::vector<hiprtc_src_file>, // NOLINT
                            const std::vector<std::string>&, // NOLINT
                            const std::string&)
{
    // Stub for builds configured without hiprtc: always fails.
    MIGRAPHX_THROW("Not using hiprtc");
}
bool is_hip_clang_compiler()
{
    // Cached check: the configured hip compiler binary is named "clang++".
    static const bool clang_like = fs::path{MIGRAPHX_HIP_COMPILER}.stem() == "clang++";
    return clang_like;
}
#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
bool has_compiler_launcher()
{
    // Cached check that the configured compiler launcher exists on disk.
    static const bool launcher_present = fs::exists(MIGRAPHX_HIP_COMPILER_LAUNCHER);
    return launcher_present;
}
#endif
src_compiler assemble(src_compiler compiler)
{
    // Turn an object-file compile into an assembly dump: swap the output
    // extension and replace every "-c" flag with "-S".
    compiler.out_ext = ".S";
    for(auto& flag : compiler.flags)
    {
        if(flag == "-c")
            flag = "-S";
    }
    return compiler;
}
std::vector<std::vector<char>> compile_hip_src(const std::vector<src_file>& srcs,
                                               const std::vector<std::string>& params,
                                               const std::string& arch)
{
    // Compile HIP sources to a code object by invoking the offline hip
    // compiler (clang++) configured at build time.
    assert(not srcs.empty());
    if(not is_hip_clang_compiler())
        MIGRAPHX_THROW("Unknown hip compiler: " MIGRAPHX_HIP_COMPILER);
    src_compiler compiler;
    compiler.flags = params;
    compiler.compiler = MIGRAPHX_HIP_COMPILER;
#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
    // Use the configured launcher (e.g. a compiler cache) when it exists.
    if(has_compiler_launcher())
        compiler.launcher = MIGRAPHX_HIP_COMPILER_LAUNCHER;
#endif
    // Default to C++17 unless the caller already selected a standard.
    if(std::none_of(params.begin(), params.end(), [](const std::string& s) {
           return starts_with(s, "--std=") or starts_with(s, "-std=");
       }))
        compiler.flags.emplace_back("--std=c++17");
    // NOTE(review): the leading space in " -fno-gpu-rdc" and the trailing
    // space in the -O flag below look intentional (flags appear to be joined
    // into a command line) — confirm against src_compiler::compile.
    compiler.flags.emplace_back(" -fno-gpu-rdc");
    if(enabled(MIGRAPHX_GPU_DEBUG_SYM{}))
        compiler.flags.emplace_back("-g");
    compiler.flags.emplace_back("-c");
    // Compile device code only, for the requested architecture.
    compiler.flags.emplace_back("--offload-arch=" + arch);
    compiler.flags.emplace_back("--cuda-device-only");
    compiler.flags.emplace_back("-O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3") + " ");
    if(enabled(MIGRAPHX_GPU_DEBUG{}))
        compiler.flags.emplace_back("-DMIGRAPHX_DEBUG");
    compiler.flags.emplace_back("-Wno-unused-command-line-argument");
    compiler.flags.emplace_back("-Wno-cuda-compat");
    compiler.flags.emplace_back(MIGRAPHX_HIP_COMPILER_FLAGS);
    // Optionally echo the .cpp sources for debugging.
    if(enabled(MIGRAPHX_GPU_DUMP_SRC{}))
    {
        for(const auto& src : srcs)
        {
            if(src.path.extension() != ".cpp")
                continue;
            std::cout << std::string(src.content) << std::endl;
        }
    }
    // Optionally dump the generated assembly (separate compile with -S).
    if(enabled(MIGRAPHX_GPU_DUMP_ASM{}))
    {
        std::cout << assemble(compiler).compile(srcs).data() << std::endl;
    }
    return {compiler.compile(srcs)};
}
bool hip_has_flags(const std::vector<std::string>& flags)
{
src_compiler compiler;
compiler.compiler = MIGRAPHX_HIP_COMPILER;
compiler.flags = flags;
compiler.flags.emplace_back("-x hip");
compiler.flags.emplace_back("-c");
compiler.flags.emplace_back("--offload-arch=gfx900");
compiler.flags.emplace_back("--cuda-device-only");
std::string src;
src_file input{"main.cpp", src};
try
{
compiler.compile({input});
return true;
}
catch(...)
{
return false;
}
}
#endif // MIGRAPHX_USE_HIPRTC
std::string enum_params(std::size_t count, std::string param)
{
    // Build "<param>0,<param>1,...,<param>(count-1)"; empty for count == 0.
    std::string result;
    for(std::size_t i = 0; i < count; i++)
    {
        if(i > 0)
            result += ",";
        result += param + std::to_string(i);
    }
    return result;
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,215 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/code_object_op.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/context.hpp>
#include <migraphx_kernels.hpp>
#include <migraphx/stringutils.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
std::string generate_make_shape(const shape& s)
{
    // Emit a make_shape(<lens>, <strides>) source expression for shape s.
    const auto lens    = generate_index_ints(s.lens());
    const auto strides = generate_index_ints(s.strides());
    return "make_shape(" + lens + ", " + strides + ")";
}
// Source template for a make_tensor<N> specialization; the ${n}, ${type},
// ${lens} and ${strides} placeholders are filled in by generate_make_tensor
// via interpolate_string.
static const char* const make_tensor_template = R"__migraphx__(
template<>
struct make_tensor<${n}>
{
    static __device__ auto apply(void* __restrict__ p)
    {
        return make_tensor_view(reinterpret_cast<${type}* __restrict__>(p), make_shape(${lens}, ${strides}));
    }
};
)__migraphx__";
std::string generate_make_tensor(std::size_t n, const shape& s)
{
    // Specialize make_tensor<n> with the element type, lens and strides of s.
    const std::string index   = std::to_string(n);
    const std::string type    = shape::cpp_type(s.type());
    const std::string lens    = generate_index_ints(s.lens());
    const std::string strides = generate_index_ints(s.strides());
    return interpolate_string(
        make_tensor_template,
        {{"n", index}, {"type", type}, {"lens", lens}, {"strides", strides}});
}
std::string generate_args_hpp(const std::vector<shape>& inputs)
{
    // Concatenate one make_tensor specialization per input shape.
    std::string specializations;
    for(std::size_t i = 0; i < inputs.size(); i++)
        specializations += generate_make_tensor(i, inputs[i]);
    // Header skeleton; __content__ is replaced with the specializations.
    const std::string args_hpp = R"__migraphx__(
#ifndef MIGRAPHX_GUARD_AUTO_ARGS_HPP
#define MIGRAPHX_GUARD_AUTO_ARGS_HPP
#include <migraphx/kernels/args.hpp>
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/types.hpp>
namespace migraphx {
__content__
} // namespace migraphx
#endif
)__migraphx__";
    return replace_string(args_hpp, "__content__", specializations);
}
// Warning set used when compiling generated kernels: enable everything via
// -Weverything, then disable warnings that are noisy or not applicable to
// machine-generated device code.
static std::vector<std::string> get_compiler_warnings()
{
    std::vector<std::string> warnings = {
        "-Weverything",
        "-Wno-c++98-compat",
        "-Wno-c++98-compat-pedantic",
        "-Wno-conversion",
        "-Wno-double-promotion",
        "-Wno-exit-time-destructors",
        "-Wno-extra-semi",
        "-Wno-extra-semi-stmt",
        "-Wno-float-conversion",
        "-Wno-gnu-anonymous-struct",
        "-Wno-gnu-zero-variadic-macro-arguments",
        "-Wno-missing-prototypes",
        "-Wno-nested-anon-types",
        "-Wno-padded",
        "-Wno-shorten-64-to-32",
        "-Wno-sign-conversion",
        "-Wno-sign-compare",
        "-Wno-unused-command-line-argument",
        "-Wno-weak-vtables",
        "-Wno-c99-extensions",
    };
    // Only newer compilers know this warning; probe support before disabling
    // it so older compilers are not handed an unknown flag.
    if(hip_has_flags({"-Werror", "-Wunsafe-buffer-usage"}))
        warnings.push_back("-Wno-unsafe-buffer-usage");
    return warnings;
}
const std::vector<std::string>& compiler_warnings()
{
    // Built once on first use; the underlying flag probe is expensive.
    static const std::vector<std::string> cached = get_compiler_warnings();
    return cached;
}
void hip_compile_options::set_launch_params(
    const value& v,
    const std::function<std::size_t(std::size_t local)>& compute_global,
    std::size_t default_local)
{
    // Local size: taken from v when present, otherwise default_local.
    local = v.get("local", default_local);
    // Global size: an explicit value wins; otherwise derive it from local.
    global = v.contains("global") ? v.at("global").to<std::size_t>() : compute_global(local);
}
static bool hip_accept_non_uniform_wg()
{
    // Cached: whether the compiler accepts -fno-offload-uniform-block,
    // i.e. supports non-uniform workgroup launches.
    static const bool supported = hip_has_flags({"-fno-offload-uniform-block"});
    return supported;
}
std::function<std::size_t(std::size_t local)>
compute_global_for(context& ctx, std::size_t n, std::size_t over)
{
    // Returns a function mapping a local (workgroup) size to a global launch
    // size covering n elements, oversubscribing each CU by up to `over`.
    assert(over > 0);
    std::size_t max_global = ctx.get_current_device().get_cu_count() *
                             ctx.get_current_device().get_max_workitems_per_cu();
    return [n, over, max_global](std::size_t local) {
        auto ceil_div = [](std::size_t a, std::size_t b) { return 1 + (a - 1) / b; };
        // Without non-uniform workgroup support, round up to a full group.
        std::size_t num_elements = n;
        if(not hip_accept_non_uniform_wg())
            num_elements = ceil_div(n, local) * local;
        std::size_t groups     = ceil_div(num_elements, local);
        std::size_t max_blocks = max_global / local;
        // Cap the grid at `over` blocks per available slot.
        std::size_t nglobal = std::min(max_blocks * over, groups) * local;
        return std::min(nglobal, num_elements);
    };
}
std::size_t compute_block_size(context& ctx, std::size_t n, std::size_t max_block_size)
{
    // Round n up to a multiple of the wavefront size, then keep the result
    // within [min_block_size, max_block_size].
    const std::size_t min_block_size = ctx.get_current_device().get_wavefront_size();
    const std::size_t rounded        = ((n - 1) / min_block_size + 1) * min_block_size;
    return std::min(std::max(min_block_size, rounded), max_block_size);
}
// Compile `content` (plus the bundled migraphx kernel headers and generated
// args.hpp) into a code_object_op configured with the requested launch
// parameters.
operation
compile_hip_code_object(context& ctx, const std::string& content, hip_compile_options options)
{
    assert(options.global > 0);
    assert(options.local > 0);
    assert(not options.inputs.empty());
    assert(options.inputs.size() == options.virtual_inputs.size() or
           options.virtual_inputs.empty());
    std::vector<src_file> srcs = options.additional_src_files;
    // The embedded kernel headers are shared by every compilation.
    static auto kernels{::migraphx_kernels()};
    std::transform(
        kernels.begin(),
        kernels.end(),
        std::back_inserter(srcs),
        [](const std::pair<std::string_view, std::string_view>& elem) { return src_file{elem}; });
    srcs.emplace_back("main.cpp", content);
    // args.hpp holds make_tensor specializations for the input shapes;
    // virtual inputs take precedence when provided.
    auto args_hpp =
        generate_args_hpp(options.virtual_inputs.empty() ? options.inputs : options.virtual_inputs);
    srcs.emplace_back("args.hpp", args_hpp);
    // A non-divisible global size is only allowed when the compiler supports
    // non-uniform workgroups.
    if(options.global % options.local != 0 and hip_accept_non_uniform_wg())
        options.emplace_param("-fno-offload-uniform-block");
    else
        assert(options.global % options.local == 0);
    // Expose the launch configuration to the kernel as macros.
    options.emplace_param("-DMIGRAPHX_NGLOBAL=" + std::to_string(options.global))
;
    options.emplace_param("-DMIGRAPHX_NLOCAL=" + std::to_string(options.local));
    options.emplace_param("-DMIGRAPHX_WAVEFRONTSIZE=" +
                          std::to_string(ctx.get_current_device().get_wavefront_size()));
    const auto& warnings = compiler_warnings();
    options.params.insert(options.params.end(), warnings.begin(), warnings.end());
    options.emplace_param("-ftemplate-backtrace-limit=0");
    options.emplace_param("-Werror");
    auto cos = compile_hip_src(srcs, options.params, get_device_name());
    if(cos.size() != 1)
        MIGRAPHX_THROW("No code object");
    return code_object_op{value::binary{cos.front()},
                          options.kernel_name,
                          options.global,
                          options.local,
                          options.inputs,
                          options.output,
                          options.output_arg};
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,78 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#if MIGRAPHX_USE_HIPBLASLT
#include <migraphx/gpu/compile_hipblaslt.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/module.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
static size_t compile(migraphx::context& ctx, operation& op, instruction_ref ins)
{
    // Compile op for this instruction and report the workspace size it
    // requests (0 when absent).
    const auto compiled = op.compile(ctx, ins->get_shape(), to_shapes(ins->inputs()));
    return compiled.get<std::size_t>("workspace", 0);
}
// Replace every gpu::hipblaslt_op in the module with its wrapped operation
// plus a workspace allocation sized by the compiled kernel.
void compile_hipblaslt::apply(module& m) const
{
    assert(ctx);
    for(auto ins : iterator_for(m))
    {
        if(ins->name() != "gpu::hipblaslt_op")
            continue;
        auto op = any_cast<hipblaslt_op>(ins->get_operator()).op;
        auto inputs = ins->inputs();
        // First pass: use the default workspace size so the op can compile.
        std::size_t ws = hipblaslt_workspace_size;
        auto alloc = m.insert_instruction(
            ins, make_op("allocate", {{"shape", to_value(shape{shape::uint8_type, {ws}})}}));
        inputs.insert(std::prev(inputs.end()), alloc);
        m.replace_instruction(ins, op, inputs);
        // Calculate workspace size
        ws = compile(*ctx, op, ins);
        auto alloc_after = m.insert_instruction(
            ins, make_op("allocate", {{"shape", to_value(shape{shape::uint8_type, {ws}})}}));
        // Replace the workspace size with the actual workspace size needed.
        auto it = std::find(inputs.begin(), inputs.end(), alloc);
        if(it != inputs.end())
        {
            *it = alloc_after; // Replace `alloc` with `alloc_after`
        }
        m.replace_instruction(ins, op, inputs);
    }
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_USE_HIPBLASLT

View File

@ -0,0 +1,89 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/compile_miopen.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/module.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/op/identity.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Wrapper that carries a MIOpen-backed operation through lowering until the
// compile_miopen pass appends its workspace allocation.
struct miopen_op
{
    operation op = op::identity{};
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.op, "op"));
    }
    std::string name() const { return "gpu::miopen_op"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
        // Duplicate the last input before delegating — presumably standing in
        // for the extra argument the wrapped op expects; TODO confirm.
        inputs.push_back(inputs.back());
        return op.compute_shape(inputs);
    }
    // The output aliases the last input.
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};
MIGRAPHX_REGISTER_OP(miopen_op);
std::size_t compile_miopen::compile(operation& op, instruction_ref ins) const
{
    // Compile op against this instruction's shapes and return the workspace
    // size it reports (0 when absent).
    const auto compiled = op.compile(*ctx, ins->get_shape(), to_shapes(ins->inputs()));
    return compiled.get<std::size_t>("workspace", 0);
}
void compile_miopen::apply(module& m) const
{
    assert(ctx);
    for(auto ins : iterator_for(m))
    {
        if(ins->name() != "gpu::miopen_op")
            continue;
        // Unwrap the real operation and compile it to learn the workspace size.
        auto op                 = any_cast<miopen_op>(ins->get_operator()).op;
        const std::size_t bytes = compile(op, ins);
        // Allocate the workspace and pass it as the second-to-last argument.
        auto args      = ins->inputs();
        auto workspace = m.insert_instruction(
            ins, make_op("allocate", {{"shape", to_value(shape{shape::int8_type, {bytes}})}}));
        args.insert(std::prev(args.end()), workspace);
        m.replace_instruction(ins, op, args);
    }
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,332 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/program.hpp>
#include <migraphx/module.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/algorithm.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/compile_ops.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/time_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_COMPILE_PARALLEL);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_BENCHMARKING);
// Placeholder op that defers compilation of the wrapped `op` until the
// compile_ops pass runs.
struct precompile_op
{
    operation op = op::identity{};
    // Number of trailing arguments (e.g. the output allocation) excluded from
    // the wrapped op's shape computation.
    std::size_t additional_args = 1;
    // When true, compute the shape without the attached module inputs.
    bool ignore_modules = false;
    // Explicit output shape; when set it bypasses op.compute_shape entirely.
    std::optional<shape> output_shape = nullopt;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.op, "op"),
                    f(self.additional_args, "additional_args"),
                    f(self.ignore_modules, "ignore_modules"),
                    f(self.output_shape, "output_shape"));
    }
    std::string name() const { return "gpu::precompile_op"; }
    shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
    {
        // Pop off additional args
        inputs.resize(inputs.size() - additional_args);
        if(output_shape.has_value())
            return output_shape.value();
        if(ignore_modules)
            return op.compute_shape(inputs);
        return op.compute_shape(inputs, mods);
    }
    // The output aliases the last input.
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};
MIGRAPHX_REGISTER_OP(precompile_op);
// One finished compilation: the replacement action plus the instruction it
// applies to.
struct compiled_result
{
    compiler_replace replace;
    instruction_ref ins;
    // Streams the replacement's trace output (used at high trace levels).
    friend std::ostream& operator<<(std::ostream& os, const compiled_result& cr)
    {
        cr.replace.trace(os, cr.ins);
        return os;
    }
};
// The unit of work for compile_ops: compiles one precompile instruction's
// candidate solutions, benchmarks them when there is more than one, and
// records the winner in the context's problem cache.
struct compile_plan
{
    context* ctx;
    operation preop;
    instruction_ref ins;
    // Tuning configuration for preop; nullopt when the op is not tunable.
    optional<tuning_config> config = nullopt;
    // One slot per candidate solution; nullopt marks a failed compile.
    std::vector<optional<compiled_result>> results = {};
    // Query the op's tuning config (run in parallel by compile_manager).
    void update_config(bool exhaustive)
    {
        config = get_tuning_config(*ctx, ins, preop, exhaustive);
    }
    // Queue a deferred compile of `solution` into results[i]. Exceptions are
    // swallowed and recorded as nullopt so benchmarking can skip the config.
    template <class Vector>
    void insert_compiles(Vector& compiles, const value& solution, std::size_t i)
    {
        compiles.emplace_back([=] {
            try
            {
                results[i] = compiled_result{compile(*ctx, ins, preop, solution), ins};
            }
            catch(const std::exception& e)
            {
                const auto trace_level = value_of(MIGRAPHX_TRACE_BENCHMARKING{});
                if(trace_level > 0)
                    std::cerr << "Exception in " + preop.name() + ": " + e.what() << std::endl;
                results[i] = nullopt;
            }
            catch(...)
            {
                results[i] = nullopt;
            }
        });
    }
    // Decide which solutions to compile based on the problem cache:
    // - cached non-null solution: compile just that one;
    // - cached null solution: another plan is benchmarking it, skip for now;
    // - uncached: mark it and compile every candidate solution;
    // - not tunable: a single untuned compile.
    template <class Vector>
    void add_compiles(Vector& compiles)
    {
        if(config.has_value())
        {
            const auto& problem = config->problem;
            if(auto sol = ctx->get_problem_cache().get(preop.name(), problem))
            {
                auto solution = sol.value();
                // No solution yet until benchmarked so skip for now
                if(solution.is_null())
                    return;
                results.resize(1);
                insert_compiles(compiles, solution, 0);
            }
            else
            {
                ctx->get_problem_cache().mark(preop.name(), problem);
                const auto& solutions = config->solutions;
                if(solutions.empty())
                    MIGRAPHX_THROW("No solutions provided for " + preop.name() + " with " +
                                   to_string(problem));
                results.resize(solutions.size());
                for(auto i : range(solutions.size()))
                {
                    auto solution = solutions[i];
                    insert_compiles(compiles, solution, i);
                }
            }
        }
        else
        {
            results.resize(1);
            insert_compiles(compiles, value{}, 0);
        }
    }
    // Human-readable problem key for error messages.
    std::string problem_string() const
    {
        if(config)
            return to_string(config->problem);
        return "<no problem key>";
    }
    // Pick the compiled result to use. With a single result it is returned
    // directly; with several, each is timed in a throwaway program and the
    // fastest is cached and returned. Throws when nothing compiled.
    const compiled_result& benchmark() const
    {
        const auto trace_level = value_of(MIGRAPHX_TRACE_BENCHMARKING{});
        if(trace_level > 0 and not results.empty())
        {
            std::cout << "Benchmarking " << preop.name() << ": " << results.size() << " configs"
                      << std::endl;
        }
        if(results.empty())
            MIGRAPHX_THROW("No valid tuned compilation for " + preop.name() + " with " +
                           problem_string());
        if(results.size() == 1)
        {
            if(not results.front().has_value())
                MIGRAPHX_THROW("No valid tuned compilation for " + preop.name() + " with " +
                               problem_string());
            return *results.front();
        }
        if(not config)
            MIGRAPHX_THROW("Multiple kernels without config for " + preop.name());
        if(trace_level > 1)
            std::cout << "Problem: " << config->problem << std::endl;
        // Time every candidate; failed compiles score the maximum time.
        std::vector<double> times;
        times.reserve(results.size());
        std::transform(results.begin(),
                       results.end(),
                       config->solutions.begin(),
                       std::back_inserter(times),
                       [&](const auto& cr, const auto& solution) {
                           if(trace_level > 1)
                               std::cout << "Benchmarking solution: " << solution << std::endl;
                           if(not cr.has_value())
                           {
                               if(trace_level > 1)
                                   std::cout << "No binary" << std::endl;
                               return std::numeric_limits<double>::max();
                           }
                           if(trace_level > 2)
                               std::cout << *cr << std::endl;
                           /*
                           create a small program with instruction being compiled and call "replace"
                           on that which would insert all the compiled code objects, prefills etc.
                           necessary to run candidate code object
                           */
                           program bench_prog;
                           auto* bench_mm = bench_prog.get_main_module();
                           std::vector<instruction_ref> bench_ins_inputs;
                           std::transform(cr->ins->inputs().begin(),
                                          cr->ins->inputs().end(),
                                          std::back_inserter(bench_ins_inputs),
                                          [&](const auto& arg) {
                                              return bench_mm->add_parameter(
                                                  std::to_string(bench_ins_inputs.size()),
                                                  arg->get_shape());
                                          });
                           auto bench_ins = bench_mm->add_instruction(
                               cr->ins->get_operator(), bench_ins_inputs, cr->ins->module_inputs());
                           cr->replace.replace(*bench_mm, bench_ins);
                           // do dead code elimination by directly removing instruction
                           bench_mm->remove_instruction(bench_ins);
                           auto t = time_program(*ctx, bench_prog, 20);
                           if(trace_level > 1)
                               std::cout << t << "ms" << std::endl;
                           return t;
                       });
        std::this_thread::sleep_for(std::chrono::milliseconds{50});
        auto i = std::distance(times.begin(), std::min_element(times.begin(), times.end()));
        if(trace_level > 0)
            std::cout << "Fastest solution: " << config->solutions.at(i) << std::endl;
        // Record the winner so later plans with the same problem reuse it.
        ctx->get_problem_cache().insert(preop.name(), config->problem, config->solutions.at(i));
        if(not results[i].has_value())
            MIGRAPHX_THROW("No valid tuned compilation for " + preop.name() + " with " +
                           problem_string());
        auto skipped = std::count_if(
            results.begin(), results.end(), [](const auto& cr) { return not cr.has_value(); });
        if(skipped > 0)
            std::cout << "Skipped " << skipped << " configs for " << preop.name() << std::endl;
        return *results[i];
    }
    // Apply the winning compilation to the real module.
    void replace(module& m) const
    {
        const auto& cr = benchmark();
        cr.replace.replace(m, cr.ins);
    }
};
// Run f(0..n-1) in parallel; the grain size comes from the environment
// (MIGRAPHX_GPU_COMPILE_PARALLEL), where 0/unset means one chunk per item.
template <class F>
void par_compile(std::size_t n, F f)
{
    if(n == 0)
        return;
    auto divisor = value_of(MIGRAPHX_GPU_COMPILE_PARALLEL{});
    if(divisor == 0)
        divisor = n;
    par_for(n, n / divisor, f);
}
// Drives a batch of compile_plans: configures them in parallel, runs the
// compiles in parallel, then benchmarks/replaces serially.
struct compile_manager
{
    std::vector<compile_plan> cps;
    bool exhaustive = false;
    template <class... Ts>
    void add_plan(Ts&&... xs)
    {
        cps.push_back({std::forward<Ts>(xs)...});
    }
    void update_configs()
    {
        par_compile(cps.size(), [&](auto i) { cps[i].update_config(exhaustive); });
    }
    // One pass: plans whose cached solution is still being benchmarked get
    // empty results and are retried by a later call.
    void compile(module& m)
    {
        std::vector<std::function<void()>> compiles;
        for(auto& cp : cps)
        {
            cp.add_compiles(compiles);
        }
        par_compile(compiles.size(), [&](auto i) { compiles[i](); });
        // Replace and/or benchmark
        for(const auto& cp : cps)
        {
            if(cp.results.empty())
                continue;
            cp.replace(m);
        }
        // Remove compile_plan already executed
        cps.erase(std::remove_if(cps.begin(),
                                 cps.end(),
                                 [](const auto& cp) { return not cp.results.empty(); }),
                  cps.end());
    }
};
void compile_ops::apply(module& m) const
{
    compile_manager manager;
    manager.exhaustive = exhaustive_tune;
    // Collect a compile plan for every precompile op in the module.
    for(auto ins : iterator_for(m))
    {
        if(ins->name() != "gpu::precompile_op")
            continue;
        operation preop = any_cast<precompile_op>(ins->get_operator()).op;
        manager.add_plan(ctx, preop, ins);
    }
    manager.update_configs();
    manager.compile(m);
    // Second pass picks up plans deferred behind cached-but-unbenchmarked
    // solutions from the first pass.
    manager.compile(m);
    assert(manager.cps.empty());
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,50 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/module.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
operation
compile_pointwise(context& ctx, const std::vector<migraphx::shape>& in_shapes, const_module_ref pm)
{
    // Generate the pointwise function body and a kernel name derived from
    // the module's operators, then hand both to the generic pointwise compiler.
    auto preamble      = gen::generate_pointwise(*pm, "inner_pointwise", true);
    std::string lifted = "MIGRAPHX_LIFT(inner_pointwise)";
    auto kernel        = gen::generate_name_from_ops(*pm, "kernel");
    return gpu::compile_op(
        "pointwise",
        ctx,
        in_shapes,
        {{"lambda", lifted}, {"preamble", preamble}, {"kernel", kernel}});
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,74 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/compiler.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace {
// Bundle of callbacks registered for one operator name.
struct compiler_handle
{
    compiler_compile compile;
    compiler_compile_op compile_op;
    compiler_tuning_config get_tuning_config;
};
} // namespace
auto& compiler_map()
{
    // Global registry keyed by operator name.
    static std::unordered_map<std::string, compiler_handle> handlers; // NOLINT
    return handlers;
}
void register_compiler(const std::string& name,
compiler_compile c,
compiler_compile_op cop,
compiler_tuning_config ctg)
{
compiler_map()[name] = {std::move(c), std::move(cop), std::move(ctg)};
}
bool has_compiler_for(const std::string& name)
{
    // True when a compiler has been registered under this operator name.
    const auto& handlers = compiler_map();
    return handlers.find(name) != handlers.end();
}
compiler_replace
compile(context& ctx, instruction_ref ins, const operation& op, const value& solution)
{
    // Dispatch by the op's name; at() throws when nothing is registered.
    const auto& handler = compiler_map().at(op.name());
    return handler.compile(ctx, ins, op, solution);
}
operation
compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v)
{
    // Dispatch by explicit name; at() throws when nothing is registered.
    const auto& handler = compiler_map().at(name);
    return handler.compile_op(ctx, inputs, v);
}
optional<tuning_config>
get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive)
{
    // Forward to the registered compiler's tuning-config hook.
    const auto& handler = compiler_map().at(op.name());
    return handler.get_tuning_config(ctx, ins, op, exhaustive);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,52 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/argmax.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/arg_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void argmax(hipStream_t stream,
            const argument& result,
            const argument& arg,
            int64_t axis,
            bool select_last_index)
{
    // The tie-breaking policy selects which reduction functor runs.
    if(not select_last_index)
        arg_op(argmax_op_first_index{}, stream, result, arg, axis);
    else
        arg_op(argmax_op_last_index{}, stream, result, arg, axis);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,52 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/argmin.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/arg_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Device dispatch for argmin along `axis`: ties resolve to either the first
// or the last matching index depending on select_last_index.
void argmin(hipStream_t stream,
            const argument& result,
            const argument& arg,
            int64_t axis,
            bool select_last_index)
{
    select_last_index ? arg_op(argmin_op_last_index{}, stream, result, arg, axis)
                      : arg_op(argmin_op_first_index{}, stream, result, arg, axis);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,65 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/device/contiguous.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Copy between arbitrarily-strided tensors: iterate a dense shape built from
// the output lens; the hip views remap each multi-index through the real
// strides of both tensors.
void contiguous_nonstandard(hipStream_t stream, const argument& result, const argument& arg)
{
    const shape dense{result.get_shape().type(), result.get_shape().lens()};
    visit_all(result, arg)([&](auto out_v, auto in_v) {
        hip_visit_views(out_v, in_v, dense)([&](auto out, auto in, auto dense_shape) {
            mi_gs_launch(stream, dense_shape)([=](auto idx) __device__ { out[idx] = in[idx]; });
        });
    });
}
// Copy between tensors that share the same packed element order: a flat copy
// over linear indices suffices, no stride translation needed.
void contiguous_packed(hipStream_t stream, const argument& result, const argument& arg)
{
    const index_int n = result.get_shape().elements();
    visit_all(result, arg)([&](auto out_v, auto in_v) {
        const auto* src = device_cast(in_v.data());
        auto* dst       = device_cast(out_v.data());
        gs_launch(stream, n)([=](auto i) __device__ { dst[i] = src[i]; });
    });
}
// Entry point: pick the flat copy when the shapes match and are packed,
// otherwise fall back to the stride-translating kernel.
void contiguous(hipStream_t stream, const argument& result, const argument& arg)
{
    const bool flat_copy = result.get_shape() == arg.get_shape() and result.get_shape().packed();
    if(flat_copy)
        contiguous_packed(stream, result, arg);
    else
        contiguous_nonstandard(stream, result, arg);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,40 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/device/fill.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Broadcast the scalar `val` to every element of the result tensor.
void fill(hipStream_t stream, const argument& result, unsigned long val)
{
    auto produce = [=]() __device__ { return val; };
    nary(stream, result)(produce);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,185 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Generates a compound-assignment operator `op` (element-wise against another
// array, or a scalar applied to every element) together with the matching
// binary operator `binary_op` in array/array, array/scalar and scalar/array
// forms.
// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op)                                                    \
    MIGRAPHX_DEVICE_CONSTEXPR hip_array& operator op(const hip_array& x)                           \
    {                                                                                              \
        for(index_int i = 0; i < N; i++)                                                           \
            d[i] op x[i];                                                                          \
        return *this;                                                                              \
    }                                                                                              \
    MIGRAPHX_DEVICE_CONSTEXPR hip_array& operator op(const T& x)                                   \
    {                                                                                              \
        for(index_int i = 0; i < N; i++)                                                           \
            d[i] op x;                                                                             \
        return *this;                                                                              \
    }                                                                                              \
    friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator binary_op(hip_array x, const hip_array& y) \
    {                                                                                              \
        return x op y;                                                                             \
    }                                                                                              \
    friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator binary_op(hip_array x, const T& y)         \
    {                                                                                              \
        return x op y;                                                                             \
    }                                                                                              \
    friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator binary_op(const T& y, hip_array x)         \
    {                                                                                              \
        return x op y;                                                                             \
    }

// Fixed-size array usable in device code. Provides element-wise arithmetic,
// comparisons and small index-math helpers for multi-dimensional indices.
template <class T, index_int N>
struct hip_array
{
    T d[N];
    MIGRAPHX_DEVICE_CONSTEXPR T& operator[](index_int i) { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](index_int i) const { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR T& front() { return d[0]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& front() const { return d[0]; }
    MIGRAPHX_DEVICE_CONSTEXPR T& back() { return d[N - 1]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& back() const { return d[N - 1]; }
    MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }
    // The size is exposed as an integral_constant so it stays usable in
    // constant expressions.
    MIGRAPHX_DEVICE_CONSTEXPR std::integral_constant<index_int, N> size() const { return {}; }
    MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); }
    MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); }
    // Inner product with another array.
    MIGRAPHX_DEVICE_CONSTEXPR T dot(const hip_array& x) const
    {
        T result = 0;
        for(index_int i = 0; i < N; i++)
            result += x[i] * d[i];
        return result;
    }
    // Product of all elements.
    MIGRAPHX_DEVICE_CONSTEXPR T product() const
    {
        T result = 1;
        for(index_int i = 0; i < N; i++)
            result *= d[i];
        return result;
    }
    // Fold the elements into one number in base `width`, treating the last
    // element as the least-significant digit. Only lossless while every
    // element is < width.
    MIGRAPHX_DEVICE_CONSTEXPR T single(index_int width = 100) const
    {
        T result = 0;
        T a      = 1;
        for(index_int i = 0; i < N; i++)
        {
            result += d[N - i - 1] * a;
            a *= width;
        }
        return result;
    }
    MIGRAPHX_DEVICE_ARRAY_OP(+=, +)
    MIGRAPHX_DEVICE_ARRAY_OP(*=, *)
    MIGRAPHX_DEVICE_ARRAY_OP(/=, /)
    MIGRAPHX_DEVICE_ARRAY_OP(%=, %)
    MIGRAPHX_DEVICE_ARRAY_OP(&=, &)
    MIGRAPHX_DEVICE_ARRAY_OP(|=, |)
    MIGRAPHX_DEVICE_ARRAY_OP(^=, ^)
    friend MIGRAPHX_DEVICE_CONSTEXPR bool operator==(const hip_array& x, const hip_array& y)
    {
        for(index_int i = 0; i < N; i++)
        {
            if(x[i] != y[i])
                return false;
        }
        return true;
    }
    friend MIGRAPHX_DEVICE_CONSTEXPR bool operator!=(const hip_array& x, const hip_array& y)
    {
        return not(x == y);
    }
    // This uses the product order rather than lexical order
    friend MIGRAPHX_DEVICE_CONSTEXPR bool operator<(const hip_array& x, const hip_array& y)
    {
        for(index_int i = 0; i < N; i++)
        {
            if(not(x[i] < y[i]))
                return false;
        }
        return true;
    }
    friend MIGRAPHX_DEVICE_CONSTEXPR bool operator>(const hip_array& x, const hip_array& y)
    {
        return y < x;
    }
    // Note: under the product order, <= / >= are not simply the negations
    // of > / <, hence the explicit disjunctions below.
    friend MIGRAPHX_DEVICE_CONSTEXPR bool operator<=(const hip_array& x, const hip_array& y)
    {
        return (x < y) or (x == y);
    }
    friend MIGRAPHX_DEVICE_CONSTEXPR bool operator>=(const hip_array& x, const hip_array& y)
    {
        return (y < x) or (x == y);
    }
    // Treat *this as the per-dimension radix and normalize `result` into it:
    // overflow in each dimension (starting from the last) is carried into the
    // next more-significant dimension; overflow reaching dimension 0 is
    // simply added there without further reduction.
    MIGRAPHX_DEVICE_CONSTEXPR hip_array carry(hip_array result) const
    {
        uint32_t overflow = 0;
        for(std::ptrdiff_t i = result.size() - 1; i > 0; i--)
        {
            auto z = result[i] + overflow;
            // Reset overflow
            overflow = 0;
            // Compute overflow using while loop instead of mod
            while(z >= d[i])
            {
                z -= d[i];
                overflow += 1;
            }
            result[i] = z;
        }
        result[0] += overflow;
        return result;
    }
};
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,70 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_FAST_DIV_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_FAST_DIV_HPP
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Number of fractional bits in the fixed-point reciprocal used by fast_div.
constexpr const uint64_t fast_div_shift = 42;
// Precompute the fixed-point reciprocal ceil(2^fast_div_shift / divisor)
// consumed by fast_div on the device. A divisor of 0 returns 0 (no valid
// encoding). The ceiling is computed as quotient plus remainder-test rather
// than (p + divisor - 1) / divisor, so the addition cannot overflow uint64_t
// for very large divisors.
inline uint64_t encode_divisor(uint64_t divisor)
{
    if(divisor == 0)
        return 0;
    const auto p = uint64_t{1} << fast_div_shift;
    return p / divisor + static_cast<uint64_t>(p % divisor != 0);
}
// A divisor is safe for the fast_div fixed-point scheme when it fits in half
// of the shift width (21 bits); larger values may lose precision.
inline constexpr bool is_divisor_encodable(uint64_t i)
{
    constexpr auto limit = uint64_t{1} << (fast_div_shift / 2);
    return i < limit;
}
// Approximate dividend / divisor as a multiply-shift using the reciprocal
// produced by encode_divisor; intended to be exact for values small enough
// to pass is_divisor_encodable above.
MIGRAPHX_DEVICE_CONSTEXPR uint64_t fast_div(uint64_t dividend, uint64_t encoded_divisor)
{
    return (dividend * encoded_divisor) >> fast_div_shift;
}
// Recover the remainder from an already-computed quotient `result`:
// dividend - divisor * (dividend / divisor).
MIGRAPHX_DEVICE_CONSTEXPR uint64_t remainder(uint64_t result, uint64_t dividend, uint64_t divisor)
{
    return dividend - divisor * result;
}
// dividend % divisor via fast_div; needs both the plain divisor and its
// encoded reciprocal.
MIGRAPHX_DEVICE_CONSTEXPR uint64_t fast_mod(uint64_t dividend,
                                            uint64_t divisor,
                                            uint64_t encoded_divisor)
{
    return remainder(fast_div(dividend, encoded_divisor), dividend, divisor);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,74 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
#include <migraphx/requires.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class... Ts>
using common_type = typename std::common_type<Ts...>::type;
// Tolerant floating-point equality on device: x and y compare equal when y
// lies within one nextafter step of x in either direction. Non-finite values
// (inf, nan) never compare equal.
template <class T, MIGRAPHX_REQUIRES(is_floating_point<T>{})>
__device__ bool float_equal_device(T x, T y)
{
    return std::isfinite(x) and std::isfinite(y) and
           std::nextafter(x, std::numeric_limits<T>::lowest()) <= y and
           std::nextafter(x, std::numeric_limits<T>::max()) >= y;
}
// __bf16 specialization: widen both operands to float and apply the same
// one-step (in float) tolerance as the generic floating-point overload.
template <>
__device__ bool float_equal_device(__bf16 x, __bf16 y) // NOLINT(misc-definitions-in-headers)
{
    float xf = x;
    float yf = y;
    return std::isfinite(xf) and std::isfinite(yf) and
           std::nextafter(xf, std::numeric_limits<float>::lowest()) <= yf and
           std::nextafter(xf, std::numeric_limits<float>::max()) >= yf;
}
// Non-floating-point types compare with exact equality.
template <class T, MIGRAPHX_REQUIRES(not is_floating_point<T>{})>
__device__ bool float_equal_device(T x, T y)
{
    return x == y;
}
// Entry point: promote both operands to their common type, then dispatch to
// the matching float_equal_device overload above.
template <class T, class U>
__device__ bool float_equal(T x, U y)
{
    return float_equal_device<common_type<T, U>>(x, y);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,146 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_LAUNCH_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_LAUNCH_HPP
#include <hip/hip_runtime.h>
#include <migraphx/config.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/targets.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Per-thread coordinates of a 1-D kernel launch, plus strided-iteration
// helpers. Filled in by launcher() for the current thread.
struct index
{
    index_int global = 0; // global thread id
    index_int local  = 0; // thread id within its workgroup
    index_int group  = 0; // workgroup id
    // Total number of threads in the launch.
    __device__ index_int nglobal() const { return blockDim.x * gridDim.x; } // NOLINT
    // Number of threads per workgroup.
    __device__ index_int nlocal() const { return blockDim.x; } // NOLINT
    // Grid-stride loop: visit i = global, global + nglobal(), ... while i < n.
    template <class F>
    __device__ void global_stride(index_int n, F f) const
    {
        const auto stride = nglobal();
        for(index_int i = global; i < n; i += stride)
        {
            f(i);
        }
    }
    // Workgroup-stride loop: visit i = local, local + nlocal(), ... while i < n.
    template <class F>
    __device__ void local_stride(index_int n, F f) const
    {
        const auto stride = nlocal();
        for(index_int i = local; i < n; i += stride)
        {
            f(i);
        }
    }
};
// Kernel entry point: build the index for the calling thread and hand it to
// the user functor.
template <class F>
__global__ void launcher(F f)
{
    index idx{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
    f(idx);
}
// Create a launcher: launch(stream, global, local)(f) runs f(idx) on `global`
// threads grouped into workgroups of `local` threads. `global` must be a
// positive multiple of `local`, since nblocks = global / local.
inline auto launch(hipStream_t stream, index_int global, index_int local)
{
    return [=](auto f) {
        assert(local > 0);
        assert(global > 0);
        using f_type = decltype(f);
        dim3 nblocks(global / local);
        dim3 nthreads(local);
        /*
        hipGetLastError() returns error for the first failed HIP call that happened previously.
        MIGraphX calls into various backend libraries and failed HIP calls can also happen there.
        Calling hipGetLastError() would reset error code to hipSuccess, so that inside MIGraphX
        failed call to hipLaunchKernelGGL() can be captured.
        */
        hipError_t flush_call = hipGetLastError();
        (void)(flush_call);
        // cppcheck-suppress migraphx-UseDeviceLaunch
        hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, stream, f);
        // Check the launch itself; errors inside the kernel surface later.
        hipError_t kernel_launch_status = hipGetLastError();
        if(kernel_launch_status != hipSuccess)
        {
            std::string message = hipGetErrorString(kernel_launch_status);
            // A common launch failure is running on a GPU the build did not
            // target; append an actionable rebuild hint in that case.
            if(not contains(get_targets(), get_device_name()))
            {
                message += ". Trying to run a kernel for " + get_device_name() +
                           " but MIGraphX was built for targets " + get_targets_as_string() +
                           ". Please rebuild MIGraphX with -DGPU_TARGETS='" + get_device_name() +
                           "'.";
            }
            MIGRAPHX_THROW("MIGraphX device kernel failed to launch with error: " + message);
        }
    };
}
// Call f(i, idx) when the functor accepts the thread index; otherwise the
// overload below falls back to f(i). Selection is via decltype/SFINAE.
template <class F>
MIGRAPHX_DEVICE_CONSTEXPR auto gs_invoke(F&& f, index_int i, index idx) -> decltype(f(i, idx))
{
    return f(i, idx);
}
// Fallback overload for functors that only take the element index.
template <class F>
MIGRAPHX_DEVICE_CONSTEXPR auto gs_invoke(F&& f, index_int i, index) -> decltype(f(i))
{
    return f(i);
}
// Global-stride launch over n elements: workgroups of `local` threads are
// created (block count capped at 2^30) and each thread strides through
// [0, n), invoking f(i) or f(i, idx) via gs_invoke.
inline auto gs_launch(hipStream_t stream, index_int n, index_int local = 1024)
{
    index_int groups = (n + local - 1) / local;
    // max possible number of blocks is set to 1B (1,073,741,824)
    index_int nglobal = std::min<index_int>(1073741824, groups) * local;
    return [=](auto f) {
        launch(stream, nglobal, local)([=](auto idx) __device__ {
            idx.global_stride(n, [&](auto i) { gs_invoke(f, i, idx); });
        });
    };
}
// clang-tidy cannot parse __shared__, so compile it away during tidy runs.
#ifdef MIGRAPHX_USE_CLANG_TIDY
#define MIGRAPHX_DEVICE_SHARED
#else
#define MIGRAPHX_DEVICE_SHARED __shared__
#endif
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,164 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_MULTI_INDEX_HPP
#define MIGRAPHX_GUARD_RTGLIB_MULTI_INDEX_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/shape.hpp>
#include <migraphx/functional.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Multi-dimensional analogue of `index`: a starting position `id` and a
// per-iteration `stride`, both expressed as N-dimensional coordinates.
template <index_int N>
struct multi_index
{
    using hip_index = hip_array<index_int, N>;
    hip_index id{};     // starting multi-dimensional position
    hip_index stride{}; // multi-dimensional step per iteration
    // Visit every position p = id, id + stride, ... (normalized with carry
    // against n) while p < n under hip_array's product order.
    MIGRAPHX_DEVICE_CONSTEXPR auto for_stride(hip_index n) const
    {
        // f should return void, but this helps with type deduction
        return [=](auto f) -> decltype(f(hip_index{})) {
            for(hip_index i = id; i < n; i = n.carry(i + stride))
            {
                f(i);
            }
        };
    }
};
// Declaration used only inside decltype() to name the result type of a
// for_stride callable; never defined.
// NOTE(review): relies on a type named `id` being visible here (declared
// elsewhere, not shown in this chunk) — confirm against the full header.
template <class ForStride>
__device__ __host__ auto deduce_for_stride(ForStride fs) -> decltype(fs(id{}));
// 1-D convenience overload: start at i, step by n.
MIGRAPHX_DEVICE_CONSTEXPR multi_index<1> make_multi_index(index_int i, index_int n)
{
    return {{i}, {n}};
}
// Convert linear position i and linear stride n into coordinates of shape s.
template <index_int N>
MIGRAPHX_DEVICE_CONSTEXPR multi_index<N>
make_multi_index(const hip_shape<N>& s, index_int i, index_int n)
{
    return {s.multi(i), s.multi(n)};
}
// As above, but the stride is already supplied in coordinate form.
template <index_int N>
MIGRAPHX_DEVICE_CONSTEXPR multi_index<N>
make_multi_index(const hip_shape<N>& s, index_int i, const hip_array<index_int, N>& n)
{
    return {s.multi(i), n};
}
// Compute the total global thread count for shape s with nlocal-sized
// workgroups (block count capped at 2^30), returned in coordinate form
// (s.multi) so it can serve as a multi_index stride.
template <index_int N>
inline auto mi_nglobal(const hip_shape<N>& s, index_int nlocal)
{
    assert(s.standard);
    assert(s.elements() > 0);
    index_int n      = s.elements();
    index_int groups = (n + nlocal - 1) / nlocal;
    // max possible number of blocks is set to 1B (1,073,741,824)
    index_int nglobal = std::min<index_int>(1073741824, groups) * nlocal;
    assert(groups > 0);
    assert(nglobal > 0);
    auto nglobal_multi = s.multi(nglobal);
    // Skip checking this, since this will cause metadata to not be generated
    // for some unknown reason.
    //
    // assert(std::any_of(nglobal_multi.begin(), nglobal_multi.end(), [](auto x){return x>0;}));
    // cppcheck-suppress migraphx-RedundantLocalVariable
    return nglobal_multi;
}
// Convert the local (workgroup) thread count into coordinates of shape s,
// for use as the stride of a workgroup-level multi_index.
template <index_int N>
inline auto mi_nlocal(const hip_shape<N>& s, index_int local)
{
    assert(s.standard);
    assert(s.elements() > 0);
    auto nlocal_multi = s.multi(local);
    // Skip checking this, since this will cause metadata to not be generated
    // for some unknown reason.
    //
    // assert(std::any_of(nlocal_multi.begin(), nlocal_multi.end(), [](auto x){return x>0;}));
    // cppcheck-suppress migraphx-RedundantLocalVariable
    return nlocal_multi;
}
// Launch over a multi-dimensional global shape: f receives the raw thread
// index plus a for_stride callable that iterates this thread's share of the
// coordinates of `global`.
template <index_int N>
inline auto mi_launch(hipStream_t stream, const hip_shape<N>& global, index_int nlocal = 1024)
{
    auto nglobal_multi = mi_nglobal(global, nlocal);
    auto nglobal       = global.index(nglobal_multi);
    return [=](auto f) {
        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
            auto midx = make_multi_index(global, idx.global, nglobal_multi);
            f(idx, midx.for_stride(global.lens));
        });
    };
}
// Launch with separate global and local (workgroup) iteration shapes: f
// receives the raw index plus for_stride callables over both shapes.
// NOTE(review): unlike the single-shape overload above, the kernel lambda
// here carries no __device__ annotation — confirm this is intentional.
template <index_int N>
inline auto mi_launch(hipStream_t stream,
                      const hip_shape<N>& global,
                      const hip_shape<N>& local,
                      index_int nlocal = 1024)
{
    auto nglobal_multi = mi_nglobal(global, 1);
    auto nglobal       = global.index(nglobal_multi);
    auto nlocal_multi  = mi_nlocal(local, nlocal);
    return [=](auto f) {
        launch(stream, nglobal * nlocal, nlocal)([=](auto idx) {
            // TODO: Use fast div for nlocal
            auto midx = make_multi_index(global, idx.global / nlocal, nglobal_multi);
            auto lidx = make_multi_index(local, idx.local, nlocal_multi);
            f(idx, midx.for_stride(global.lens), lidx.for_stride(local.lens));
        });
    };
}
// Multi-index global-stride launch: hides the raw thread index and exposes
// only the coordinate iterator over `global` to the user functor.
template <index_int N>
inline auto mi_gs_launch(hipStream_t stream, const hip_shape<N>& global, index_int nlocal = 1024)
{
    return [=](auto f) {
        mi_launch(stream, global, nlocal)(
            [=](auto, auto stride_for) { stride_for([&](auto i) { f(i); }); });
    };
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,473 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_NARY_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_NARY_HPP
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/multi_index.hpp>
#include <migraphx/gpu/device/visit.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/array.hpp>
#include <migraphx/env.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/config.hpp>
#include <iostream>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// When the MIGRAPHX_TRACE_NARY env var is enabled, print which nary device
// function was selected for a given call.
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_NARY)
// NOLINTNEXTLINE
#define MIGRAPHX_TRACE_NARY_FUNCTION     \
    if(enabled(MIGRAPHX_TRACE_NARY{}))   \
        std::cout << "nary device function: " << __PRETTY_FUNCTION__ << std::endl;
// Bundle a value pack into a closure; invoking the closure with a callable
// applies it to the stored values. Used to defer pack expansion.
template <class... Ts>
constexpr auto pack(Ts... xs)
{
    auto invoke_with = [=](auto g) { return g(xs...); };
    return invoke_with;
}
// Generic fallback kernel: iterate a dense shape built from the output lens;
// the hip views translate each multi-index through every argument's strides,
// so arbitrary (non-packed, non-standard) layouts work.
template <class F, class... Arguments>
auto nary_nonstandard_nonpacked_impl(hipStream_t stream, F f, argument result, Arguments... args)
{
    MIGRAPHX_TRACE_NARY_FUNCTION
    shape s{result.get_shape().type(), result.get_shape().lens()};
    hip_visit_all(s, result, args...)([&](auto standard_shape, auto output, auto... inputs) {
        mi_gs_launch(stream,
                     standard_shape)([=](auto idx) __device__ { output[idx] = f(inputs[idx]...); });
    });
}
// Build a device functor mapping a linear output index to its coordinate
// along the broadcast axis: (i % (len * stride)) / stride, evaluated with
// fast_div against precomputed reciprocals.
// NOTE(review): exactness of fast_div assumes the divisors are within the
// encodable range (see is_divisor_encodable) — presumably guaranteed by the
// callers that select these kernels; confirm.
inline auto create_broadcast_index(index_int len, index_int stride)
{
    auto next_stride   = stride * len;
    auto e_next_stride = encode_divisor(next_stride);
    auto e_stride      = encode_divisor(stride);
    return [=](auto i) __device__ {
        // ( i % next_stride) / stride
        return fast_div(i, e_stride) - len * fast_div(i, e_next_stride);
    };
}
// All arguments share one packed-but-permuted layout: derive that layout's
// permutation from the first input, reorder every shape into it, and run the
// dense multi-index kernel over the reordered views.
template <class F, class... Arguments>
auto nary_nonstandard_packed_impl(hipStream_t stream,
                                  F f,
                                  const argument& result,
                                  Arguments... args)
{
    MIGRAPHX_TRACE_NARY_FUNCTION
    auto arg_shape = make_array(args...).front().get_shape();
    auto perm      = find_permutation(arg_shape);
    auto s         = reorder_shape(arg_shape, perm);
    hip_visit_all(s, result.reshape(reorder_shape(result.get_shape(), perm)), args.reshape(s)...)(
        [&](auto standard_shape, auto output, auto... inputs) {
            mi_gs_launch(stream, standard_shape)(
                [=](auto idx) __device__ { output[idx] = f(inputs[idx]...); });
        });
}
// Vectorized kernel for exactly one broadcast argument: the broadcast values
// are staged once per workgroup in LDS (MIGRAPHX_DEVICE_SHARED), then each
// thread processes 4-wide vectors of the remaining arguments. The broadcast
// value is appended as the LAST argument of f.
// Assumptions (not checked here — presumably enforced by the caller that
// selects this kernel; confirm): bdim_len fits the 2048-scalar LDS buffer,
// and the broadcast-axis stride is a multiple of vec_size so a single scalar
// value covers each whole 4-wide vector.
template <class F, class... Arguments>
void nary_broadcast_vec_impl(
    hipStream_t stream, F f, argument result, argument barg, Arguments... args)
{
    MIGRAPHX_TRACE_NARY_FUNCTION
    const auto& output_shape = result.get_shape();
    const auto& b_shape      = barg.get_shape();
    // Broadcast axis = first dimension of barg with a non-zero stride.
    auto bdim =
        std::distance(b_shape.strides().begin(),
                      std::find_if(b_shape.strides().begin(), b_shape.strides().end(), [](auto x) {
                          return x != 0;
                      }));
    auto bdim_len      = output_shape.lens()[bdim];
    auto bdim_stride   = output_shape.strides()[bdim];
    auto broadcast_idx = create_broadcast_index(bdim_len, bdim_stride);
    const index_int vec_size     = 4;
    const index_int nlocal       = 1024;
    const index_int nglobal      = 256 * nlocal;
    const index_int bdim_vec_len = bdim_len / vec_size;
    hip_vec_visit_all<vec_size>(result, barg, args...)(
        [&](auto output, auto binput, auto... inputs) {
            using type                = typename decltype(output)::value_type;
            const index_int nelements = output.size() / vec_size;
            launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
                MIGRAPHX_DEVICE_SHARED type buffer[2048 / vec_size];
                // Load bias into LDS
                for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
                {
                    buffer[i] = binput.data()[i];
                }
                __syncthreads();
                // View the vector buffer as scalars for per-element reads.
                const auto* bp = as_pointer(buffer);
                // Process the data
                for(size_t i = idx.global; i < nelements; i += nglobal)
                {
                    // Scalar coordinate of this vector along the broadcast axis
                    auto bidx = broadcast_idx(i * vec_size);
                    auto b    = bp[bidx];
                    auto out  = output.data()[i];
                    for(index_int j = 0; j < vec_size; j++)
                    {
                        out[j] = f(inputs.data()[i][j]..., b);
                    }
                    output.data()[i] = out;
                }
            });
        });
}
// Scalar kernel for exactly one broadcast argument: the broadcast values are
// staged once per workgroup in LDS, then each thread grid-strides over the
// flat output. The broadcast value is appended as the LAST argument of f.
// Assumes bdim_len fits the 2048-element LDS buffer — presumably enforced by
// the caller that selects this kernel; confirm.
template <class F, class... Arguments>
void nary_broadcast_impl(hipStream_t stream, F f, argument result, argument barg, Arguments... args)
{
    MIGRAPHX_TRACE_NARY_FUNCTION
    const auto& output_shape = result.get_shape();
    const auto& b_shape      = barg.get_shape();
    // Broadcast axis = first dimension of barg with a non-zero stride.
    auto bdim =
        std::distance(b_shape.strides().begin(),
                      std::find_if(b_shape.strides().begin(), b_shape.strides().end(), [](auto x) {
                          return x != 0;
                      }));
    auto bdim_len      = output_shape.lens()[bdim];
    auto bdim_stride   = output_shape.strides()[bdim];
    auto broadcast_idx = create_broadcast_index(bdim_len, bdim_stride);
    const index_int nlocal  = 1024;
    const index_int nglobal = 256 * nlocal;
    index_int nelements     = result.get_shape().elements();
    hip_visit_all(result, barg, args...)([&](auto output, auto binput, auto... inputs) {
        using type = typename decltype(output)::value_type;
        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
            MIGRAPHX_DEVICE_SHARED type buffer[2048];
            // Load bias into LDS
            for(size_t i = idx.local; i < bdim_len; i += nlocal)
            {
                buffer[i] = binput.data()[i];
            }
            __syncthreads();
            // Process the data
            for(size_t i = idx.global; i < nelements; i += nglobal)
            {
                auto bidx = broadcast_idx(i);
                auto b    = buffer[bidx];
                output.data()[i] = f(inputs.data()[i]..., b);
            }
        });
    });
}
// Vectorized kernel for exactly two broadcast arguments sharing one shape:
// both value runs are staged in LDS (barg1 first, barg2 at an offset of
// bdim_len scalars), then each thread processes 4-wide vectors. Note the
// argument order passed to f: barg2's value comes BEFORE barg1's.
// Assumptions (presumably enforced by the caller — confirm): 2 * bdim_len
// fits the 2048-scalar LDS buffer, and the broadcast-axis stride is a
// multiple of vec_size so one scalar covers each whole vector.
template <class F, class... Arguments>
void nary_double_broadcast_vec_impl(
    hipStream_t stream, F f, argument result, argument barg1, argument barg2, Arguments... args)
{
    MIGRAPHX_TRACE_NARY_FUNCTION
    assert(barg1.get_shape().broadcasted());
    assert(barg2.get_shape().broadcasted());
    assert(barg1.get_shape() == barg2.get_shape());
    const auto& output_shape = result.get_shape();
    const auto& b_shape      = barg1.get_shape();
    // Broadcast axis = first dimension with a non-zero stride.
    auto bdim =
        std::distance(b_shape.strides().begin(),
                      std::find_if(b_shape.strides().begin(), b_shape.strides().end(), [](auto x) {
                          return x != 0;
                      }));
    auto bdim_len      = output_shape.lens()[bdim];
    auto bdim_stride   = output_shape.strides()[bdim];
    auto broadcast_idx = create_broadcast_index(bdim_len, bdim_stride);
    const index_int vec_size     = 4;
    const index_int nlocal       = 1024;
    const index_int nglobal      = 256 * nlocal;
    const index_int bdim_vec_len = bdim_len / vec_size;
    hip_vec_visit_all<vec_size>(result, barg1, barg2, args...)(
        [&](auto output, auto binput1, auto binput2, auto... inputs) {
            using type                = typename decltype(output)::value_type;
            const index_int nelements = output.size() / vec_size;
            launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
                MIGRAPHX_DEVICE_SHARED type buffer[2048 / vec_size];
                // Load bias into LDS
                for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
                {
                    buffer[i] = binput1.data()[i];
                }
                for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
                {
                    buffer[i + bdim_vec_len] = binput2.data()[i];
                }
                __syncthreads();
                // View the vector buffer as scalars for per-element reads.
                const auto* bp = as_pointer(buffer);
                // Process the data
                for(size_t i = idx.global; i < nelements; i += nglobal)
                {
                    auto bidx = broadcast_idx(i * vec_size);
                    auto b1   = bp[bidx];
                    auto b2   = bp[bidx + bdim_len];
                    auto out  = output.data()[i];
                    for(index_int j = 0; j < vec_size; j++)
                    {
                        out[j] = f(inputs.data()[i][j]..., b2, b1);
                    }
                    output.data()[i] = out;
                }
            });
        });
}
// Scalar kernel for exactly two broadcast arguments sharing one shape: both
// value runs are staged in LDS (barg1 first, barg2 at offset bdim_len), then
// each thread grid-strides over the flat output. Note the argument order
// passed to f: barg2's value comes BEFORE barg1's.
// Assumes 2 * bdim_len fits the 2048-element LDS buffer — presumably
// enforced by the caller that selects this kernel; confirm.
template <class F, class... Arguments>
void nary_double_broadcast_impl(
    hipStream_t stream, F f, argument result, argument barg1, argument barg2, Arguments... args)
{
    MIGRAPHX_TRACE_NARY_FUNCTION
    assert(barg1.get_shape().broadcasted());
    assert(barg2.get_shape().broadcasted());
    assert(barg1.get_shape() == barg2.get_shape());
    const auto& output_shape = result.get_shape();
    const auto& b_shape      = barg1.get_shape();
    // Broadcast axis = first dimension with a non-zero stride.
    auto bdim =
        std::distance(b_shape.strides().begin(),
                      std::find_if(b_shape.strides().begin(), b_shape.strides().end(), [](auto x) {
                          return x != 0;
                      }));
    auto bdim_len      = output_shape.lens()[bdim];
    auto bdim_stride   = output_shape.strides()[bdim];
    auto broadcast_idx = create_broadcast_index(bdim_len, bdim_stride);
    const index_int nlocal  = 1024;
    const index_int nglobal = 256 * nlocal;
    index_int nelements     = result.get_shape().elements();
    hip_visit_all(result, barg1, barg2, args...)(
        [&](auto output, auto binput1, auto binput2, auto... inputs) {
            using type = typename decltype(output)::value_type;
            launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
                MIGRAPHX_DEVICE_SHARED type buffer[2048];
                // Load bias into LDS
                for(size_t i = idx.local; i < bdim_len; i += nlocal)
                {
                    buffer[i] = binput1.data()[i];
                }
                for(size_t i = idx.local; i < bdim_len; i += nlocal)
                {
                    buffer[i + bdim_len] = binput2.data()[i];
                }
                __syncthreads();
                // Process the data
                for(size_t i = idx.global; i < nelements; i += nglobal)
                {
                    auto bidx = broadcast_idx(i);
                    auto b1   = buffer[bidx];
                    auto b2   = buffer[bidx + bdim_len];
                    output.data()[i] = f(inputs.data()[i]..., b2, b1);
                }
            });
        });
}
// Elementwise op over standard-layout tensors using 4-wide vectors: all
// inputs and the output are reinterpreted as vec<type,4> and processed one
// vector per iteration. Callers must ensure the element count is divisible
// by 4 before choosing this path.
template <class F, class... Arguments>
void nary_standard_vec_impl(hipStream_t stream, F f, argument result, Arguments... args)
{
    MIGRAPHX_TRACE_NARY_FUNCTION
    const auto& output_shape = result.get_shape();
    visit_all(result, args...)([&](auto output, auto... inputs) {
        using type = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
        const index_int vec_size = 4;
        // pack_vec bundles the vectorized input pointers; calling data(g, i)
        // invokes g with the i-th vector of every input.
        auto data = pack_vec<4>(device_cast(inputs.data())...);
        auto* outp = as_vec<4>(device_cast(output.data()));
        gs_launch(stream, output_shape.elements() / vec_size)([=](auto i) __device__ {
            vec<type, 4> out = outp[i];
            data(
                [&](auto... xs) {
                    // Apply f lane by lane within the vector.
                    for(index_int j = 0; j < vec_size; j++)
                    {
                        out[j] = f(xs[j]...);
                    }
                },
                i);
            outp[i] = out;
        });
    });
}
// Simplest elementwise kernel: every tensor is addressed with the same linear
// index, one read per input and one write to the output per element.
template <class F, class... Arguments>
void nary_standard_impl(hipStream_t stream, F f, argument result, Arguments... args)
{
    MIGRAPHX_TRACE_NARY_FUNCTION
    index_int nelements = result.get_shape().elements();
    hip_pointer_visit_all(result, args...)([&](auto output, auto... inputs) {
        gs_launch(stream, nelements)([=](auto i) __device__ { output[i] = f(inputs[i]...); });
    });
}
// Dispatches an n-ary elementwise op to the most specialized kernel that the
// input/output layouts permit, from the direct linear-index kernel down to
// the fully general fallback.
template <class F, class... Arguments>
void nary_impl(hipStream_t stream, F f, argument result, Arguments... args)
{
    MIGRAPHX_TRACE_NARY_FUNCTION
    const auto input_shapes = make_array(args.get_shape()...);
    const bool all_standard =
        all_of(input_shapes, [](const shape& s) { return s.standard(); });
    const bool all_packed =
        all_of(input_shapes, [](const shape& s) { return s.packed() and not s.broadcasted(); });
    const bool match_output =
        all_of(input_shapes, [&](const shape& s) { return s == result.get_shape(); });
    const bool match_first =
        all_of(input_shapes, [&](const shape& s) { return s == input_shapes[0]; });
    // Everything can share one linear index: use the direct kernel.
    if((result.get_shape().standard() and all_standard) or (all_packed and match_output))
    {
        nary_standard_impl(stream, f, result, args...);
        return;
    }
    // Inputs agree with each other but not with the output layout.
    if(all_packed and match_first)
    {
        nary_nonstandard_packed_impl(stream, f, result, args...);
        return;
    }
    // Fully general fallback.
    nary_nonstandard_nonpacked_impl(stream, f, result, args...);
}
// Returns a callable that runs f through the fully general (non-standard
// layout) elementwise kernel for the given result/args.
template <class... Arguments>
auto nary_nonstandard(hipStream_t stream, argument result, Arguments... args)
{
    return [=](auto f) { nary_nonstandard_nonpacked_impl(stream, f, result, args...); };
}
// Returns a callable that runs f through the standard-layout (linear index)
// elementwise kernel for the given result/args.
template <class... Arguments>
auto nary_standard(hipStream_t stream, argument result, Arguments... args)
{
    return [=](auto f) { nary_standard_impl(stream, f, result, args...); };
}
// Decides whether the shared-memory broadcast fast path applies: barg must be
// broadcast along exactly one axis (single non-zero stride), that axis must
// fit in max_size shared-memory elements, and all remaining inputs must be
// standard and match the result shape. On success also reports through
// divisible_by_4 whether the 4-wide vectorized kernel can be used.
template <class... Arguments>
bool broadcastable(bool& divisible_by_4,
                   index_int max_size,
                   const argument& result,
                   const argument& barg,
                   const Arguments&... args)
{
    divisible_by_4 = false;
    auto bshape = barg.get_shape();
    const bool standard =
        all_of({args.get_shape()...}, [](const shape& s) { return s.standard(); });
    const bool same_shapes =
        all_of({args.get_shape()...}, [&](const shape& s) { return s == result.get_shape(); });
    // TODO: Check result and args shape is the same
    if(standard and same_shapes and bshape.broadcasted() and not bshape.scalar())
    {
        auto not_zero = [](auto x) { return x != 0; };
        const auto& strides = bshape.strides();
        // Locate the (single) broadcast axis: the first non-zero stride.
        auto b_it = std::find_if(strides.begin(), strides.end(), not_zero);
        auto b_idx = std::distance(strides.begin(), b_it);
        auto b_len = result.get_shape().lens()[b_idx];
        auto b_stride = result.get_shape().strides()[b_idx];
        assert(bshape.lens()[b_idx] == b_len);
        // Require: the axis fits in shared memory, no other non-zero strides
        // (i.e. truly one broadcast axis), and the fast-division encoding of
        // b_stride * b_len is representable for create_broadcast_index.
        if(b_len <= max_size and std::none_of(std::next(b_it), strides.end(), not_zero) and
           is_divisor_encodable(b_stride * b_len))
        {
            // Vectorization needs the axis length, its stride, and the total
            // element count to all be multiples of 4.
            divisible_by_4 = (b_len % 4 == 0) and (b_stride % 4 == 0) and
                             (front_args(args...).get_shape().elements() % 4 == 0);
            return true;
        }
    }
    return false;
}
// Overload for the case with no non-broadcast inputs left: the broadcast fast
// path never applies, so always report false.
inline bool broadcastable(bool& divisible_by_4, index_int, const argument&, const argument&)
{
    divisible_by_4 = false;
    return false;
}
// Nullary: no inputs, just fill the result elementwise with f().
inline auto nary(hipStream_t stream, argument result)
{
    return [=](auto f) { nary_standard_impl(stream, f, result); };
}
// Unary: single input, dispatched by layout through nary_impl.
inline auto nary(hipStream_t stream, argument result, argument arg)
{
    return [=](auto f) { nary_impl(stream, f, result, arg); };
}
// Binary: if the second input is broadcast along a single axis that fits in
// shared memory (2048 elements), use the broadcast fast path (vectorized when
// possible); otherwise fall back to the generic layout dispatch.
inline auto nary(hipStream_t stream, argument result, argument arg, argument barg)
{
    return [=](auto f) {
        bool use_vec = false;
        if(not broadcastable(use_vec, 2048, result, barg, arg))
        {
            nary_impl(stream, f, result, arg, barg);
            return;
        }
        if(use_vec)
            nary_broadcast_vec_impl(stream, f, result, barg, arg);
        else
            nary_broadcast_impl(stream, f, result, barg, arg);
    };
}
// Variadic (3+ args) entry point. Tries increasingly specialized kernels:
//  1. double-broadcast (last two args broadcast identically, axis <= 1024),
//  2. single broadcast on the last arg (axis <= 2048),
//  3. generic nary_impl fallback.
// The nested lambdas return true to signal "fall back to the next option".
template <class... Arguments>
auto nary(hipStream_t stream, argument result, Arguments... args)
{
    static_assert(sizeof...(args) > 2, "Args needs to be greater than 2");
    return [=](auto f) {
        // barg1 is the last argument, the primary broadcast candidate.
        auto barg1 = back_args(args...);
        bool fallback1 = pop_back_args(args...)([&](auto&&... args2) {
            // barg2 is the second-to-last argument; if it matches barg1's
            // broadcast shape, attempt the double-broadcast kernels.
            auto barg2 = back_args(args2...);
            bool fallback2 =
                barg2.get_shape() != barg1.get_shape() or not barg2.get_shape().broadcasted() or
                pop_back_args(args2...)([&](auto&&... args3) {
                    bool divisible_by_4 = false;
                    // 1024 (half of 2048) since both broadcast inputs must
                    // fit into shared memory together.
                    if(broadcastable(divisible_by_4, 1024, result, barg2, args3...))
                    {
                        if(divisible_by_4)
                            nary_double_broadcast_vec_impl(
                                stream, f, result, barg1, barg2, args3...);
                        else
                            nary_double_broadcast_impl(stream, f, result, barg1, barg2, args3...);
                        return false;
                    }
                    return true;
                });
            if(not fallback2)
                return false;
            // Double-broadcast did not apply; try single broadcast on barg1.
            bool divisible_by_4 = false;
            if(broadcastable(divisible_by_4, 2048, result, barg1, args2...))
            {
                if(divisible_by_4)
                    nary_broadcast_vec_impl(stream, f, result, barg1, args2...);
                else
                    nary_broadcast_impl(stream, f, result, barg1, args2...);
                return false;
            }
            return true;
        });
        // No broadcast fast path applied: generic dispatch.
        if(fallback1)
            nary_impl(stream, f, result, args...);
    };
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,311 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/visit.hpp>
#include <migraphx/gpu/device/multi_index.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
#ifdef MIGRAPHX_NO_DPP
// Block-wide reduction without DPP cross-lane instructions: each thread folds
// its strided elements into a private accumulator, then a shared-memory tree
// combines the per-thread values. All threads return the final value from
// buffer[0]. N is the shared buffer capacity; assumes idx.nlocal() <= N
// (callers pass the block-size cap as N — TODO confirm).
template <index_int N,
          class Op,
          class T,
          class ForStride,
          class F,
          MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})>
__device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
{
    using type = decltype(f(deduce_for_stride(fs)));
    MIGRAPHX_DEVICE_SHARED type buffer[N];
    // Per-thread sequential reduction over the strided range.
    type x = init;
    fs([&](auto i) { x = op(x, f(i)); });
    buffer[idx.local] = x;
    __syncthreads();
    // Tree reduction in shared memory; stride doubles each round.
    for(index_int s = 1; s < idx.nlocal(); s *= 2)
    {
        const index_int index = 2 * s * idx.local;
        if(index + s < idx.nlocal())
        {
            buffer[index] = op(buffer[index], buffer[index + s]);
        }
        __syncthreads();
    }
    return buffer[0];
}
#else
// Builds the DPP control word for a row_shr (row shift right) by x lanes.
constexpr unsigned int dpp_row_shr(unsigned int x)
{
    constexpr unsigned int row_shr_base = 0x110u;
    return row_shr_base | x;
}
// Builds the DPP control word for a row_bcast of lane 15 or 31; any other
// lane is not a valid row-broadcast source and raises an error.
constexpr unsigned int dpp_row_bcast(unsigned int x)
{
    if(x == 15)
        return 0x142;
    if(x == 31)
        return 0x143;
    throw std::runtime_error("Unknown bcast");
}
// Wraps the __hip_move_dpp intrinsic for arbitrary trivially-copyable T:
// the value is split into 32-bit registers (the intrinsic's operand size),
// each register is moved across lanes according to DppCtrl, and the pieces
// are reassembled.
template <unsigned int DppCtrl,
          unsigned int RowMask = 0xf,
          unsigned int BankMask = 0xf,
          bool BoundCtrl = false,
          class T>
__device__ T dpp_mov(T& x)
{
    // Number of 32-bit words needed to cover T (sub-word types use one).
    static const index_int n = sizeof(T) < 4 ? 1 : sizeof(T) / 4;
    union type
    {
        uint32_t reg[n];
        T data;
    };
    type output{};
    type input{};
    // cppcheck-suppress unreadVariable
    input.data = x;
    for(index_int i = 0; i < n; i++)
    {
        output.reg[i] = __hip_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
    }
    return output.data;
}
// Wavefront-level reduction using DPP row shifts: after the row_shr steps
// every row's partial result sits in its last lane; on 64-lane wavefronts two
// extra row broadcasts combine the rows so the final lane holds the total.
template <class T, class Op>
__device__ void dpp_reduce(T& in, Op op)
{
    T out{};
    out = dpp_mov<dpp_row_shr(1)>(in);
    in = op(in, out);
    out = dpp_mov<dpp_row_shr(2)>(in);
    in = op(in, out);
    out = dpp_mov<dpp_row_shr(4), 0xf, 0xe>(in);
    in = op(in, out);
    out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
    in = op(in, out);
#if __AMDGCN_WAVEFRONT_SIZE == 64
    // Combine the four 16-lane rows into one result.
    out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
    in = op(in, out);
    out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
    in = op(in, out);
#endif
}
// Hand-written assembly specialization of dpp_reduce for float summation:
// same row_shr/row_bcast sequence as the generic version, issued directly as
// v_add_f32 with DPP modifiers (s_nop covers the required wait states).
// Under clang-tidy/cppcheck the asm is replaced by a dummy assignment so the
// analyzers see x as written.
__device__ inline void dpp_reduce(float& x, sum)
{
#if defined(MIGRAPHX_USE_CLANG_TIDY) || defined(CPPCHECK)
    x = 1;
#else
    __asm__ volatile("s_nop 4\n"
                     "v_add_f32 %0 %0 %0 row_shr:1\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_shr:2\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_shr:4 bank_mask:0xe\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
                     "s_nop 1\n"
#if __AMDGCN_WAVEFRONT_SIZE == 64
                     "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
#endif
                     "s_nop 1\n"
                     : "=v"(x)
                     : "0"(x));
#endif
}
// Block-wide reduction using DPP: each group of `nthreads` lanes reduces via
// dpp_reduce (result lands in the group's last lane), the per-group partials
// are written to shared memory, and every thread then folds all partials
// sequentially. N is the block-size cap used to size the shared buffer.
template <index_int N,
          class Op,
          class T,
          class ForStride,
          class F,
          MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})>
__device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
{
// Number of lanes dpp_reduce combines per partial; on wave32 the row
// broadcasts are unavailable so only 16-lane rows are reduced.
#if __AMDGCN_WAVEFRONT_SIZE == 32
    constexpr index_int nthreads = 16;
#else
    constexpr index_int nthreads = 64;
#endif
    using type = decltype(f(deduce_for_stride(fs)));
    MIGRAPHX_DEVICE_SHARED type buffer[N / nthreads];
    // Per-thread sequential reduction over the strided range.
    type x = init;
    fs([&](auto i) { x = op(x, f(i)); });
    dpp_reduce(x, op);
    const auto ldsidx = idx.local / nthreads;
    // Only the last lane of each group holds the group's reduced value.
    if((idx.local % nthreads) == nthreads - 1)
    {
        buffer[ldsidx] = x;
    }
    __syncthreads();
    // Every thread folds all group partials; all threads return the same value.
    type y = init;
    for(index_int i = 0; i < idx.nlocal() / nthreads; i++)
    {
        y = op(y, buffer[i]);
    }
    return y;
}
#endif
// Convenience overload: reduce n elements indexed 0..n-1 by wrapping the
// count in a multi_index for_stride and forwarding to the ForStride overload.
template <index_int N, class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
{
    auto midx = make_multi_index(idx.local, idx.nlocal());
    // Workaround hcc, create a local array
    auto fs = midx.id;
    fs[0] = n;
    return block_reduce<N>(
        idx, op, init, midx.for_stride(fs), [&](auto mi) __device__ { return f(mi[0]); });
}
// Picks a power-of-two block size for reducing n elements: starts at 64 and
// doubles until it covers n or reaches max_block_size. Both callers in this
// file pass max_block_size = 256, so the result is 64, 128 or 256.
constexpr index_int compute_block_size(index_int n, index_int max_block_size)
{
    // Use index_int for the local too; the previous size_t local forced an
    // implicit narrowing conversion on return.
    index_int block_size = 64;
    while(block_size < max_block_size and block_size < n)
        block_size *= 2;
    return block_size;
}
inline std::vector<index_int> get_reduce_lens(const std::vector<size_t>& input_lens,
const std::vector<size_t>& output_lens)
{
std::vector<index_int> reduce_lens;
std::transform(output_lens.begin(),
output_lens.end(),
input_lens.begin(),
std::back_inserter(reduce_lens),
[](auto x, auto y) -> index_int {
if(x == y)
return 1;
else
return y;
});
return reduce_lens;
}
// General reduction: for every output coordinate, block-reduces the elements
// of the corresponding reduce_slice window with op, transforming each element
// through read_input and the final value through read_output.
template <class Op, class T, class Input, class Output>
void reduce_multi_impl(hipStream_t stream,
                       const argument& result,
                       const argument& arg,
                       Op op,
                       T init,
                       Input read_input,
                       Output read_output,
                       const shape& reduce_slice)
{
    hip_visit_all(result, arg, reduce_slice)([&](auto output, auto input, auto reduce_shape) {
        auto relements = reduce_slice.elements();
        const index_int max_block_size = 256;
        const index_int block_size = compute_block_size(relements, max_block_size);
        mi_launch(stream, output.get_shape(), reduce_shape, block_size)(
            [=](auto idx, auto global, auto local) __device__ {
                // One block per output element; local strides over the slice.
                global([&](auto i) __device__ {
                    auto r =
                        block_reduce<max_block_size>(idx, op, init, local, [&](auto j) __device__ {
                            return read_input(input[i + j]);
                        });
                    // block_reduce returns the same value on all threads;
                    // only one thread writes it out.
                    if(idx.local == 0)
                        output[i] = read_output(r);
                });
            });
    });
}
// Fast path for reducing only the innermost dimension of standard-layout
// tensors: each block reduces one contiguous run of relements values.
template <class Op, class T, class Input, class Output>
void reduce_standard_impl(hipStream_t stream,
                          const argument& result,
                          const argument& arg,
                          Op op,
                          T init,
                          Input read_input,
                          Output read_output,
                          index_int relements)
{
    hip_visit_all(result, arg)([&](auto output, auto input) {
        auto nelements = result.get_shape().elements();
        const index_int max_block_size = 256;
        const index_int block_size = compute_block_size(relements, max_block_size);
        // One block (block_size threads) per output element.
        gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
            const auto out_idx = i / block_size;
            const auto base_idx = out_idx * relements;
            auto r = block_reduce<max_block_size>(idx, op, init, relements, [&](auto j) __device__ {
                return read_input(input.data()[base_idx + j]);
            });
            if(idx.local == 0)
                output.data()[out_idx] = read_output(r);
        });
    });
}
// Reduction entry point: chooses the contiguous fast path when only the
// innermost dimension is reduced on standard shapes, otherwise builds the
// reduce-slice shape and runs the general multi-dimensional implementation.
template <class Op, class T, class Input, class Output>
void reduce(hipStream_t stream,
            const argument& result,
            const argument& arg,
            Op op,
            T init,
            Input read_input,
            Output read_output)
{
    const auto& out_shape = result.get_shape();
    const auto& in_shape = arg.get_shape();
    const auto in_lens = in_shape.lens();
    const auto out_lens = out_shape.lens();
    assert(out_lens.size() == in_lens.size());
    // Fast path: standard layouts, only the last axis shrinks, all leading
    // axes identical.
    if(in_shape.standard() and out_shape.standard() and out_lens.back() != in_lens.back() and
       std::equal(out_lens.begin(), std::prev(out_lens.end()), in_lens.begin()))
    {
        reduce_standard_impl(
            stream, result, arg, op, init, read_input, read_output, in_lens.back());
        return;
    }
    auto reduce_lens = get_reduce_lens(in_lens, out_lens);
    shape reduce_slice{out_shape.type(), reduce_lens};
    reduce_multi_impl(stream, result, arg, op, init, read_input, read_output, reduce_slice);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP

View File

@ -0,0 +1,111 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_DEVICE_REDUCE_OPS_HPP
#define MIGRAPHX_GUARD_DEVICE_REDUCE_OPS_HPP
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Binary addition functor, usable as a reduction operator.
struct sum
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return x + y;
    }
};
// Binary multiplication functor, usable as a reduction operator.
struct product
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return x * y;
    }
};
// Identity functor; used where a read/write transform is required but no
// change to the value is wanted.
struct id
{
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
    {
        return x;
    }
};
// Divides a reduced sum by the number of reduced items to produce a mean.
struct mean
{
    // Number of elements that were summed; set by the caller.
    size_t item_num = 1;
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
    {
        // NOTE(review): for integral T this is integer division — presumably
        // callers only use mean with floating-point accumulators; confirm.
        return x / static_cast<T>(item_num);
    }
};
// Binary maximum functor; the ternary yields the common type of T and U.
struct max
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return (x > y) ? x : y;
    }
};
// Binary minimum functor; the ternary yields the common type of T and U.
struct min
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return (x < y) ? x : y;
    }
};
// Converts implicitly to the lowest representable value of any requested
// type T; used as the init value for max reductions. numeric_limits is
// queried on the host-side type, then cast to the device representation.
struct lowest
{
    template <class T>
    __device__ __host__ operator T() const
    {
        return device_cast(std::numeric_limits<host_type<T>>::lowest());
    }
};
// Converts implicitly to the highest representable value of any requested
// type T; used as the init value for min reductions.
struct highest
{
    template <class T>
    __device__ __host__ operator T() const
    {
        return device_cast(std::numeric_limits<host_type<T>>::max());
    }
};
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_DEVICE_REDUCE_OPS_HPP

View File

@ -0,0 +1,97 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_DEVICE_SCAN_HPP
#define MIGRAPHX_GUARD_DEVICE_SCAN_HPP
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/visit.hpp>
#include <migraphx/gpu/device/multi_index.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Block-wide inclusive scan with op over the elements produced by fs/input,
// emitting each prefix through output. Uses a double-buffered shared-memory
// scan (stride doubles each round). The running carry x chains consecutive
// fs tiles: thread 0 folds the previous tile's total into its element.
// Requires op to be associative; N is the block-size capacity of each buffer.
template <index_int N,
          class Op,
          class T,
          class ForStride,
          class Input,
          class Output,
          MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})>
__device__ void block_scan(index idx, Op op, T init, ForStride fs, Input input, Output output)
{
    using type = decltype(input(deduce_for_stride(fs)));
    // Two buffers are ping-ponged so reads of round k-1 never race with
    // writes of round k.
    MIGRAPHX_DEVICE_SHARED type buffer[2][N];
    type x = init;
    fs([&](auto i) {
        index_int iout = 0;
        index_int iin = 1;
        // Fold the carry from the previous tile into the first element.
        if(idx.local == 0)
            buffer[iout][idx.local] = op(input(i), x);
        else
            buffer[iout][idx.local] = input(i);
        __syncthreads();
        for(index_int s = 1; s < idx.nlocal(); s *= 2)
        {
            iout = 1 - iout;
            iin = 1 - iin;
            if(idx.local >= s)
            {
                buffer[iout][idx.local] = op(buffer[iin][idx.local], buffer[iin][idx.local - s]);
            }
            else
            {
                buffer[iout][idx.local] = buffer[iin][idx.local];
            }
            __syncthreads();
        }
        // Last slot now holds this tile's total: it becomes the next carry.
        x = buffer[iout][idx.nlocal() - 1];
        output(i, buffer[iout][idx.local]);
    });
}
// Convenience overload: scans n elements indexed 0..n-1 by wrapping the
// count in a local_stride callable and forwarding to the ForStride overload.
template <index_int N, class Op, class T, class Input, class Output>
__device__ void block_scan(index idx, Op op, T init, index_int n, Input input, Output output)
{
    block_scan<N>(
        idx,
        op,
        init,
        [&](auto f) -> decltype(f(index_int{})) { return idx.local_stride(n, f); },
        input,
        output);
}
// Adapts an indexed callable so index i is remapped to n - i - 1, making a
// forward scan over n elements visit them in reverse order.
template <class F>
constexpr auto reverse_scan(index_int n, F f)
{
    return [=](auto i, auto&&... xs) {
        const auto flipped = n - i - 1;
        return f(flipped, xs...);
    };
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_DEVICE_SCAN_HPP

View File

@ -0,0 +1,120 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP
#include <migraphx/gpu/device/array.hpp>
#include <migraphx/gpu/device/fast_div.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Device-side, fixed-rank (N) mirror of migraphx::shape: plain arrays of
// lens/strides plus fast-division encodings of the lens so multi() avoids
// hardware integer division on the GPU.
template <index_int N>
struct hip_shape
{
    using hip_index = hip_array<index_int, N>;
    hip_index lens = {};
    hip_index strides = {};
    // Precomputed encode_divisor(lens[i]) values consumed by fast_div.
    hip_array<std::uint64_t, N> divs = {};
    // True when the shape is standard (identity) layout, enabling the
    // trivial index(i) == i shortcut.
    bool standard = false;
    __device__ __host__ hip_shape() = default;
    // Host-side conversion; the shape's rank must equal N and every length
    // must be encodable for fast division.
    hip_shape(const shape& s) : standard(s.standard())
    {
        assert(s.lens().size() == N);
        assert(s.strides().size() == N);
        std::copy(s.lens().begin(), s.lens().end(), lens.begin());
        std::copy(s.strides().begin(), s.strides().end(), strides.begin());
        assert(std::all_of(s.lens().begin(), s.lens().end(), &is_divisor_encodable));
        std::transform(s.lens().begin(), s.lens().end(), divs.begin(), &encode_divisor);
    }
    MIGRAPHX_DEVICE_CONSTEXPR index_int elements() const { return lens.product(); }
    // Linear offset of a multi-dimensional coordinate.
    MIGRAPHX_DEVICE_CONSTEXPR index_int index(hip_index x) const { return x.dot(strides); }
    MIGRAPHX_DEVICE_CONSTEXPR index_int index(std::initializer_list<index_int> x) const
    {
        index_int idx = 0;
        for(index_int i = 0; i < x.size(); i++)
            idx += *(x.begin() + i) * strides[i];
        return idx;
    }
    // Maps a logical element ordinal to a storage offset; for non-standard
    // layouts this decomposes i digit-by-digit from the innermost axis out.
    MIGRAPHX_DEVICE_CONSTEXPR index_int index(index_int i) const
    {
        if(this->standard)
            return i;
        else
        {
            const index_int rank = this->lens.size();
            index_int s = 1;
            index_int result = 0;
            for(index_int j = 0; j < this->lens.size(); j++)
            {
                const index_int k = rank - j - 1;
                const index_int stride = this->strides[k];
                const index_int len = this->lens[k];
                const index_int slen = s * len;
                const index_int idx = (i % slen) / s;
                result += stride * idx;
                s = slen;
            }
            return result;
        }
    }
    // Converts a logical element ordinal to a multi-dimensional coordinate
    // using the precomputed fast-division encodings.
    MIGRAPHX_DEVICE_CONSTEXPR hip_index multi(index_int idx) const
    {
        hip_index result;
        index_int tidx = idx;
        for(std::ptrdiff_t is = result.size() - 1; is > 0; is--)
        {
            // result[is] = tidx % lens[is];
            // tidx = tidx / lens[is];
            auto q = fast_div(tidx, divs[is]);
            result[is] = remainder(q, tidx, lens[is]);
            tidx = q;
        }
        result[0] = tidx;
        return result;
    }
};
// Converts a host shape into a hip_shape<N> via the implicit converting
// constructor; N must match the shape's rank (asserted in that constructor).
template <index_int N>
hip_shape<N> make_hip_shape(const shape& x)
{
    return x;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,76 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
#include <migraphx/gpu/device/visit.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Fixed-rank multi-dimensional coordinate type.
template <index_int NDim>
using hip_tensor_index = hip_array<index_int, NDim>;
// Minimal device-side lens/strides descriptor with linear <-> multi index
// conversion.
template <index_int NDim>
struct hip_tensor_descriptor
{
    __device__ __host__ hip_tensor_descriptor() = default;
    // Host-side conversion; assumes s has exactly NDim dimensions (arrays
    // are filled up to NDim — TODO confirm callers guarantee the rank).
    hip_tensor_descriptor(const shape& s)
    {
        std::copy(s.lens().begin(), s.lens().end(), lens);
        std::copy(s.strides().begin(), s.strides().end(), strides);
    }
    // Decomposes a linear index by successive division by the strides.
    // NOTE(review): this is only valid for standard (descending, packed)
    // strides — presumably callers only use it with such shapes; confirm.
    __device__ __host__ hip_tensor_index<NDim> multi(index_int idx) const
    {
        hip_tensor_index<NDim> result{};
        index_int tidx = idx;
        for(index_int is = 0; is < NDim; is++)
        {
            result[is] = tidx / strides[is];
            tidx = tidx % strides[is];
        }
        return result;
    }
    // Dot product of a coordinate with the strides: the storage offset.
    __device__ __host__ index_int linear(hip_tensor_index<NDim> s) const
    {
        index_int idx = 0;
        for(index_int i = 0; i < NDim; i++)
            idx += s[i] * strides[i];
        return idx;
    }
    index_int lens[NDim] = {};
    index_int strides[NDim] = {};
};
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,82 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP
#include <migraphx/gpu/device/shape.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Non-owning device-side view over a tensor: raw data pointer plus a
// fixed-rank hip_shape. Indexing goes through hip_shape::index, so it works
// for both standard and permuted layouts.
template <class T, index_int N>
struct hip_tensor_view
{
    using value_type = T;
    using hip_index = typename hip_shape<N>::hip_index;
    __device__ __host__ hip_tensor_view() = default;
    __host__ hip_tensor_view(tensor_view<T> x) : d(x.data()), s(x.get_shape()) {}
    __host__ hip_tensor_view(T* x, const shape& ss) : d(x), s(ss) {}
    MIGRAPHX_DEVICE_CONSTEXPR const hip_shape<N>& get_shape() const { return s; }
    MIGRAPHX_DEVICE_CONSTEXPR index_int size() const { return s.elements(); }
    MIGRAPHX_DEVICE_CONSTEXPR value_type* data() const { return d; }
    // Accepts any index form hip_shape::index supports (linear ordinal,
    // hip_index, or initializer list).
    template <class U>
    MIGRAPHX_DEVICE_CONSTEXPR value_type& operator[](U i) const
    {
        return d[s.index(i)];
    }
    // NOTE: begin()/end() iterate storage order, not logical order, for
    // non-standard layouts.
    MIGRAPHX_DEVICE_CONSTEXPR value_type* begin() const { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR value_type* end() const { return d + size(); }
    private:
    value_type* d = nullptr;
    hip_shape<N> s{};
};
// Builds a rank-N device view over a raw pointer and shape.
template <index_int N, class T>
hip_tensor_view<T, N> make_hip_view(const shape& s, T* x)
{
    return {x, s};
}
// Builds a rank-N device view from a host tensor_view.
template <index_int N, class T>
hip_tensor_view<T, N> make_hip_view(tensor_view<T> x)
{
    return {x};
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,213 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
#include <hip/hip_runtime.h>
#include <migraphx/half.hpp>
#include <migraphx/bf16.hpp>
#include <migraphx/config.hpp>
#include <migraphx/tensor_view.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// 32-bit index type used throughout the device code.
using index_int = std::uint32_t;
#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
// N-wide clang extended vector of T (e.g. vec<float, 4>).
template <class T, index_int N>
using vec = T __attribute__((ext_vector_type(N)));
// Reinterprets an array of N-wide vectors as a flat scalar pointer.
template <index_int N, class T>
__device__ __host__ T* as_pointer(vec<T, N>* x)
{
    return reinterpret_cast<T*>(x);
}
// Reinterprets a flat scalar pointer as an array of N-wide vectors; callers
// are responsible for alignment and for the length being a multiple of N.
template <index_int N, class T>
__device__ __host__ vec<T, N>* as_vec(T* x)
{
    return reinterpret_cast<vec<T, N>*>(x);
}
// Vectorized view of a tensor_view; keeps the original shape object.
template <index_int N, class T>
tensor_view<vec<T, N>> as_vec(tensor_view<T> x)
{
    return {x.get_shape(), as_vec<N>(x.data())};
}
// Bundles several pointers into a callable: invoking it with (f, n) calls f
// with the n-th N-wide vector of every pointer.
template <index_int N, class... Ts>
auto pack_vec(Ts... xs)
{
    return [=](auto f, index_int n) { return f(as_vec<N>(xs)[n]...); };
}
// Native GPU representations of the reduced-precision host types.
using gpu_half = __fp16;
using gpu_bf16 = __bf16;
namespace detail {
// Maps a host type to its device representation (identity by default).
template <class T>
struct device_type
{
    using type = T;
};
// Vectors map element-wise.
template <class T, index_int N>
struct device_type<vec<T, N>>
{
    using type = vec<typename device_type<T>::type, N>;
};
template <>
struct device_type<half>
{
    using type = gpu_half;
};
template <>
struct device_type<bf16>
{
    using type = gpu_bf16;
};
// Inverse mapping: device representation back to the host type.
template <class T>
struct host_type
{
    using type = T;
};
template <>
struct host_type<gpu_half>
{
    using type = half;
};
template <>
struct host_type<gpu_bf16>
{
    using type = bf16;
};
} // namespace detail
template <class T>
using host_type = typename detail::host_type<T>::type;
template <class T>
using device_type = typename detail::device_type<T>::type;
template <class T>
host_type<T> host_cast(T x)
{
return reinterpret_cast<const host_type<T>&>(x);
}
template <class T>
host_type<T>* host_cast(T* x)
{
return reinterpret_cast<host_type<T>*>(x);
}
template <class T>
__device__ __host__ device_type<T> device_cast(const T& x)
{
return reinterpret_cast<const device_type<T>&>(x);
}
template <class T>
__device__ __host__ device_type<T>* device_cast(T* x)
{
return reinterpret_cast<device_type<T>*>(x);
}
template <class T>
__device__ __host__ tensor_view<device_type<T>> device_cast(tensor_view<T> x)
{
return {x.get_shape(), reinterpret_cast<device_type<T>*>(x.data())};
}
template <class T>
__device__ __host__ T to_hip_type(T x)
{
return x;
}
// HIP math functions don't accept __fp16 and __bf16 operands directly, so
// promote them to float before calling (e.g. ::exp, ::log).
inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }
inline __device__ __host__ float to_hip_type(gpu_bf16 x) { return x; }
// std type traits don't know the compiler-extension half types; mirror the
// traits here with __fp16/__bf16 treated as signed floating-point arithmetic.
template <class X>
struct is_floating_point : std::is_floating_point<X>
{
};
template <>
struct is_floating_point<__fp16> : std::true_type
{
};
template <class X>
struct is_signed : std::is_signed<X>
{
};
template <>
struct is_signed<__fp16> : std::true_type
{
};
template <class X>
struct is_arithmetic : std::is_arithmetic<X>
{
};
template <>
struct is_arithmetic<__fp16> : std::true_type
{
};
// Same specializations for __bf16
template <>
struct is_floating_point<__bf16> : std::true_type
{
};
template <>
struct is_signed<__bf16> : std::true_type
{
};
template <>
struct is_arithmetic<__bf16> : std::true_type
{
};
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,99 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP
#include <migraphx/gpu/device/types.hpp>
#include <cassert>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Fixed-capacity vector substitute that is usable in device code: storage is
// an inline array of N elements and the struct is copyable by value into
// device lambdas. `len` tracks how many slots are in use. No bounds checking
// is performed anywhere — callers must keep the size within N.
template <class T, index_int N>
struct hip_vector
{
    MIGRAPHX_DEVICE_CONSTEXPR hip_vector() = default;
    // Size-s vector; elements come value-initialized from the array initializer.
    MIGRAPHX_DEVICE_CONSTEXPR hip_vector(index_int s) : len(s) {}
    // Copy [start, last) into the inline storage.
    // NOTE(review): assumes std::distance(start, last) <= N — unchecked.
    template <class Iterator>
    __device__ __host__ hip_vector(Iterator start, Iterator last)
    {
        auto it = std::copy(start, last, d);
        len = std::distance(d, it);
    }
    // NOTE(review): assumes x.size() <= N — unchecked.
    __device__ __host__ hip_vector(std::initializer_list<T> x)
    {
        std::copy(x.begin(), x.end(), d);
        len = x.size();
    }
    MIGRAPHX_DEVICE_CONSTEXPR T& operator[](index_int i) { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](index_int i) const { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR T& front() { return d[0]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& front() const { return d[0]; }
    MIGRAPHX_DEVICE_CONSTEXPR T& back() { return d[size() - 1]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& back() const { return d[size() - 1]; }
    MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR index_int size() const { return len; }
    MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); }
    MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); }
    // Append x. NOTE(review): no capacity check; undefined if len == N.
    template <class U>
    MIGRAPHX_DEVICE_CONSTEXPR void push_back(U&& x)
    {
        d[len] = static_cast<U&&>(x);
        len++;
    }
    private:
    T d[N] = {};
    index_int len = 0;
};
// Copy a host std::vector into a fixed-capacity hip_vector<T, N> so it can be
// captured by value in device lambdas.
// Precondition: x.size() <= N. Previously a larger input silently wrote past
// the inline storage (undefined behavior); it is now caught by an assert in
// debug builds.
template <index_int N, class T>
hip_vector<T, N> to_hip_vector(const std::vector<T>& x)
{
    assert(x.size() <= N);
    hip_vector<T, N> result(x.size());
    std::copy(x.begin(), x.end(), result.begin());
    return result;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,245 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP
#include <migraphx/gpu/device/tensor_view.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Dispatch a runtime tensor rank n (1..5) to f as a compile-time constant:
// f receives std::integral_constant<index_int, K>, so the callee can use the
// rank in template arguments. Any rank outside [1, 5] throws.
template <class F>
constexpr void visit_tensor_size(index_int n, F f)
{
    if(n == 1)
        f(std::integral_constant<index_int, 1>{});
    else if(n == 2)
        f(std::integral_constant<index_int, 2>{});
    else if(n == 3)
        f(std::integral_constant<index_int, 3>{});
    else if(n == 4)
        f(std::integral_constant<index_int, 4>{});
    else if(n == 5)
        f(std::integral_constant<index_int, 5>{});
    else
        throw std::runtime_error("Tensor dims " + std::to_string(n) + " out of range");
}
// Uniform shape accessor: a shape passes through unchanged...
inline shape get_shape(const shape& x) { return x; }
// ...and anything exposing get_shape() (arguments, tensor views) yields its shape.
template <class T>
auto get_shape(const T& x) -> decltype(x.get_shape())
{
    return x.get_shape();
}
// Whitelist of element types the GPU device kernels are compiled for.
// Anything not listed here (e.g. double, int64) is rejected by
// hip_visitor_invoke below.
template <class T>
struct is_hip_type : std::false_type
{
};
template <>
struct is_hip_type<float> : std::true_type
{
};
template <>
struct is_hip_type<half> : std::true_type
{
};
template <>
struct is_hip_type<bool> : std::true_type
{
};
template <>
struct is_hip_type<std::int8_t> : std::true_type
{
};
template <>
struct is_hip_type<std::uint8_t> : std::true_type
{
};
template <>
struct is_hip_type<std::int32_t> : std::true_type
{
};
template <>
struct is_hip_type<bf16> : std::true_type
{
};
// Invoke the visitor when the visited element type (T::type) is GPU-supported...
template <class T, class V, MIGRAPHX_REQUIRES(is_hip_type<typename T::type>{})>
void hip_visitor_invoke(T as, V&& v)
{
    v(as);
}
// ...and fail loudly for any type outside the is_hip_type whitelist.
template <class T, class V, MIGRAPHX_REQUIRES(not is_hip_type<typename T::type>{})>
void hip_visitor_invoke(T, V&&)
{
    MIGRAPHX_THROW(std::string("Unsupported data type on GPU: ") + __PRETTY_FUNCTION__);
}
// Wrap a visitor so it goes through the supported-type gate above.
template <class V>
auto hip_visitor(V v)
{
    return [=](auto as) { hip_visitor_invoke(as, v); };
}
// Core of hip_visit_all: validate that every x matches s's element type and
// rank, then dispatch the rank and type to compile time and call v with each
// x converted by f (which receives the object, the rank constant, and the
// type wrapper `as`).
template <class V, class F, class... Ts>
void hip_visit_all_impl(const shape& s, F f, V&& v, Ts&&... xs)
{
    std::initializer_list<migraphx::shape::type_t> types = {get_shape(xs).type()...};
    if(not std::all_of(
           types.begin(), types.end(), [&](migraphx::shape::type_t t) { return t == s.type(); }))
        MIGRAPHX_THROW("Types must be the same");
    std::initializer_list<index_int> ranks = {static_cast<index_int>(get_shape(xs).ndim())...};
    if(not std::all_of(ranks.begin(), ranks.end(), [&](index_int r) { return r == s.ndim(); }))
        MIGRAPHX_THROW("Ranks must be the same");
    visit_tensor_size(s.ndim(), [&](auto ndim) {
        s.visit_type(hip_visitor([&](auto as) { v(f(xs, ndim, as)...); }));
    });
}
// Same idea for already-typed tensor views: only the ranks must agree, so no
// element-type dispatch is performed.
template <class V, class F, class... Ts>
void hip_visit_views_impl(const shape& s, F f, V&& v, Ts&&... xs)
{
    std::initializer_list<index_int> ranks = {static_cast<index_int>(get_shape(xs).ndim())...};
    if(not std::all_of(ranks.begin(), ranks.end(), [&](index_int r) { return r == s.ndim(); }))
        MIGRAPHX_THROW("Ranks must be the same");
    visit_tensor_size(s.ndim(), [&](auto ndim) { v(f(xs, ndim)...); });
}
// Converter used by hip_visit_all_impl: raw-data objects become fixed-rank
// hip views (with the data pointer transformed by f), while plain shapes
// become fixed-rank hip shapes.
template <class F>
struct hip_convert
{
    F f;
    // Raw-data case: reinterpret the buffer through the type wrapper `as`,
    // transform the pointer with f, and wrap it in a rank-ndim hip view.
    template <class RawData, class N, class As>
    auto operator()(RawData x, N ndim, As as) const
        -> decltype(make_hip_view<ndim>(x.get_shape(), f(as.from(x.data()))))
    {
        return make_hip_view<ndim>(x.get_shape(), f(as.from(x.data())));
    }
    // Shape case: no data, just fix the rank.
    template <class N, class As>
    auto operator()(const shape& s, N ndim, As) const
    {
        return make_hip_shape<ndim>(s);
    }
};
template <class F>
hip_convert<F> make_hip_convert(F f)
{
    return {f};
}
// Converter used by hip_visit_views_impl: tensor views are transformed by f
// and wrapped into fixed-rank hip views; shapes become fixed-rank hip shapes.
template <class F>
struct hip_convert_view
{
    F f;
    template <class T, class N>
    auto operator()(tensor_view<T> x, N ndim) const
    {
        return make_hip_view<ndim>(f(x));
    }
    template <class N>
    auto operator()(const shape& s, N ndim) const
    {
        return make_hip_shape<ndim>(s);
    }
};
template <class F>
hip_convert_view<F> make_hip_convert_view(F f)
{
    return {f};
}
// Visit x and xs as device tensor views with element type and rank fixed at
// compile time. Usage: hip_visit_all(a, b)([&](auto av, auto bv) { ... });
template <class T, class... Ts>
auto hip_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_all_impl(
            get_shape(x), make_hip_convert([](auto* p) { return device_cast(p); }), f, x, xs...);
    };
}
// Like hip_visit_all but the data is viewed as vec<T, N>. Requires the
// innermost dimension to be contiguous (stride 1) and divisible by N; the
// visited shape has its last dimension divided by N accordingly.
template <index_int N, class T, class... Ts>
auto hip_vec_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        auto sx = get_shape(x);
        auto lens = sx.lens();
        assert(lens.back() % N == 0);
        assert(sx.strides().back() == 1);
        lens.back() /= N;
        shape vec_sx{sx.type(), lens};
        hip_visit_all_impl(vec_sx,
                           make_hip_convert([](auto* p) { return as_vec<N>(device_cast(p)); }),
                           f,
                           x,
                           xs...);
    };
}
// Visit only the raw device pointers of the arguments (no shape information).
template <class T, class... Ts>
auto hip_pointer_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) { visit_all(x, xs...)([&](auto... vs) { f(device_cast(vs.data())...); }); };
}
// Visit already-typed tensor views (and shapes) as fixed-rank device views;
// unlike hip_visit_all, the element types of the arguments may differ.
template <class T, class... Ts>
auto hip_visit_views(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_views_impl(get_shape(x),
                             make_hip_convert_view([](auto v) { return device_cast(v); }),
                             f,
                             x,
                             xs...);
    };
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif

View File

@ -0,0 +1,80 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/logsoftmax.hpp>
#include <migraphx/gpu/device/reduce.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Numerically stable log-softmax of `arg` along `axis`, written to `result`:
// out = x - max(x) - log(sum(exp(x - max(x)))), reduced over the axis.
// One thread block handles one slice along the reduction axis.
void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
{
    auto batch_lens = result.get_shape().lens();
    index_int batch_item_num = batch_lens[axis];
    // Collapse the reduced axis to 1 to enumerate the independent slices.
    batch_lens[axis] = 1;
    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
        const index_int max_block_size = 256;
        const index_int block_size = compute_block_size(batch_item_num, max_block_size);
        // One block per slice: total threads = slices * block_size.
        gs_launch(stream,
                  batch_shape.elements() * block_size,
                  block_size)([=](auto i, auto idx) __device__ {
            auto data_idx = batch.multi(i / block_size);
            using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
            // Start the max-reduction from the lowest representable value.
            type init = lowest();
            auto batch_max = block_reduce<max_block_size>(
                idx, max{}, init, batch_item_num, [&](auto j) __device__ {
                    data_idx[axis] = j;
                    return input[data_idx];
                });
            // Sum exp(x - max) across the slice; subtracting the max first
            // keeps exp from overflowing.
            auto batch_sum =
                block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
                    data_idx[axis] = j;
                    auto val = input[data_idx] - batch_max;
                    // to_hip_type promotes half types to float for ::exp.
                    return ::exp(to_hip_type(val));
                });
            auto log_batch_sum = ::log(to_hip_type(batch_sum)) + batch_max;
            idx.local_stride(batch_item_num, [&](auto j) __device__ {
                data_idx[axis] = j;
                output[data_idx] = input[data_idx] - log_batch_sum;
            });
        });
    });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,90 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/gpu/device/multinomial.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Device-friendly reimplementation of std::upper_bound: returns the first
// iterator it in [first, last) for which value < *it, or last if no such
// element exists. [first, last) must be sorted with respect to `value`.
template <class Iterator, class T>
constexpr Iterator upper_bound(Iterator first, Iterator last, const T& value)
{
    auto len = std::distance(first, last);
    while(len > 0)
    {
        auto half = len / 2;
        Iterator mid = first;
        std::advance(mid, half);
        if(value < *mid)
        {
            // The answer is in the left half: shrink to [first, mid).
            len = half;
        }
        else
        {
            // *mid <= value, so the answer lies strictly after mid.
            first = ++mid;
            len -= half + 1;
        }
    }
    return first;
}
// Multinomial sampling: arg0 holds a per-batch cumulative distribution
// ({batch, class}), arg1 holds uniform random draws; each output element gets
// the sampled class index.
void multinomial(hipStream_t stream,
                 const argument& result,
                 const argument& arg0,
                 const argument& arg1)
{
    size_t batch_size = arg0.get_shape().lens().front();
    size_t class_size = arg0.get_shape().lens().back();
    size_t sample_size = result.get_shape().lens().back();
    visit_all(arg0, arg1)([&](auto cdf_host, auto dist_host) {
        result.visit([&](auto output_host) {
            hip_visit_views(cdf_host, dist_host, output_host)(
                [&](auto cdf, auto dist, auto output) {
                    // One thread per output sample.
                    gs_launch(stream, batch_size * sample_size)([=](auto i) __device__ {
                        auto idx = output.get_shape().multi(i);
                        // CDF row belonging to this batch element.
                        auto cdf_begin = cdf.begin() + (idx.front() * class_size);
                        auto cdf_end = cdf_begin + class_size;
                        // Scale the uniform draw by the row total (last CDF
                        // entry) so unnormalized CDFs also work, then find the
                        // first class whose cumulative weight exceeds it.
                        auto* sample_iter =
                            upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
                        output[i] = std::distance(cdf_begin, sample_iter);
                    });
                });
        });
    });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,77 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/device/nonzero.hpp>
#include <migraphx/gpu/device/float_equal.hpp>
#include <migraphx/gpu/device/scan.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Write the multi-dimensional indices of all nonzero elements of arg_data
// into `result` (int64, laid out dimension-major: ptr[k * elem_num + pos]
// holds coordinate k of the pos-th nonzero element). Unused output slots are
// zero-filled.
argument nonzero(hipStream_t stream, const argument& result, const argument& arg_data)
{
    auto s = arg_data.get_shape();
    auto elem_num = s.elements();
    auto out_elem_num = result.get_shape().elements();
    // A block-wide prefix sum over the nonzero mask assigns each nonzero
    // element its output position. Only one block can be used, since the scan
    // must cover the whole input in a single pass.
    const index_int block_size = 256;
    hip_visit_all(arg_data, s)([&](auto input, auto si) {
        const auto* in_ptr = device_cast(input.data());
        auto* ptr = result.cast<int64_t>();
        gs_launch(stream, block_size, block_size)([=](auto, auto idx) __device__ {
            // Zero the whole output first so trailing slots stay 0.
            idx.local_stride(out_elem_num, [&](auto j) { ptr[j] = 0; });
            block_scan<block_size>(
                idx,
                sum{},
                0,
                elem_num,
                // Mask: 1 for nonzero elements, 0 otherwise.
                [&](auto j) { return (float_equal(in_ptr[j], 0)) ? 0 : 1; },
                // The inclusive scan value x is this element's 1-based output
                // position; skip zeros and write the coordinates.
                [&](auto j, auto x) {
                    auto out_loc = x - 1;
                    if(float_equal(in_ptr[j], 0))
                        return;
                    auto index = si.multi(j);
                    for(size_t k = 0; k < index.size(); ++k)
                    {
                        ptr[k * elem_num + out_loc] = index[k];
                    }
                });
        });
    });
    return result;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,143 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/device/prefix_scan_sum.hpp>
#include <migraphx/gpu/device/scan.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
#include <migraphx/gpu/device/reduce.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Cumulative sum of `arg` along `axis`, written to `result`.
//   exclusive: element i receives the sum of elements strictly before it
//              (the first scanned position gets 0).
//   reverse:   the scan runs from the end of the axis toward the beginning.
// One thread block cooperatively scans each line of the tensor along `axis`
// via block_scan; the four branches below differ only in scan direction
// (reverse_scan wrappers) and output placement (shifted by one for exclusive).
void prefix_scan_sum(hipStream_t stream,
                     const argument& result,
                     const argument& arg,
                     int32_t axis,
                     bool exclusive,
                     bool reverse)
{
    const index_int max_block_size = 256;
    // Length of the scanned axis.
    const index_int n = arg.get_shape().lens()[axis];
    // Shape with the scanned axis collapsed: enumerates the independent lines.
    auto rlens = result.get_shape().lens();
    rlens[axis] = 1;
    hip_visit_all(result, arg, result.get_shape().with_lens(rlens))(
        [=](auto output, auto input, auto rshape) {
            const index_int block_size = compute_block_size(rshape.elements(), max_block_size);
            if(reverse and exclusive)
            {
                // Reverse exclusive: scan back-to-front, write each prefix one
                // position earlier; the last element of the line gets 0.
                gs_launch(stream, rshape.elements() * block_size, block_size)(
                    [=](auto i, auto idx) __device__ {
                        const auto ridx = rshape.multi(i / block_size);
                        // Full tensor index for position j on this line.
                        auto compute_idx = [&](auto j) {
                            auto k = ridx;
                            k[axis] = j;
                            return k;
                        };
                        block_scan<max_block_size>(
                            idx,
                            sum{},
                            0,
                            n,
                            reverse_scan(n, [&](auto j) { return input[compute_idx(j)]; }),
                            reverse_scan(n, [&](auto j, auto x) {
                                if(j == n - 1)
                                    output[compute_idx(j)] = 0;
                                if(j > 0)
                                    output[compute_idx(j - 1)] = x;
                            }));
                    });
            }
            else if(reverse)
            {
                // Reverse inclusive: scan back-to-front, write in place.
                gs_launch(stream, rshape.elements() * block_size, block_size)(
                    [=](auto i, auto idx) __device__ {
                        const auto ridx = rshape.multi(i / block_size);
                        auto compute_idx = [&](auto j) {
                            auto k = ridx;
                            k[axis] = j;
                            return k;
                        };
                        block_scan<max_block_size>(
                            idx,
                            sum{},
                            0,
                            n,
                            reverse_scan(n, [&](auto j) { return input[compute_idx(j)]; }),
                            reverse_scan(n, [&](auto j, auto x) { output[compute_idx(j)] = x; }));
                    });
            }
            else if(exclusive)
            {
                // Forward exclusive: write each prefix one position later;
                // position 0 gets 0 and the final total is dropped.
                gs_launch(stream, rshape.elements() * block_size, block_size)(
                    [=](auto i, auto idx) __device__ {
                        const auto ridx = rshape.multi(i / block_size);
                        auto compute_idx = [&](auto j) {
                            auto k = ridx;
                            k[axis] = j;
                            return k;
                        };
                        block_scan<max_block_size>(
                            idx,
                            sum{},
                            0,
                            n,
                            [&](auto j) { return input[compute_idx(j)]; },
                            [&](auto j, auto x) {
                                auto k = j + 1;
                                if(j == 0)
                                    output[compute_idx(0)] = 0;
                                if(k < n)
                                    output[compute_idx(k)] = x;
                            });
                    });
            }
            else
            {
                // Forward inclusive: the plain prefix sum, written in place.
                gs_launch(stream, rshape.elements() * block_size, block_size)(
                    [=](auto i, auto idx) __device__ {
                        const auto ridx = rshape.multi(i / block_size);
                        auto compute_idx = [&](auto j) {
                            auto k = ridx;
                            k[axis] = j;
                            return k;
                        };
                        block_scan<max_block_size>(
                            idx,
                            sum{},
                            0,
                            n,
                            [&](auto j) { return input[compute_idx(j)]; },
                            [&](auto j, auto x) { output[compute_idx(j)] = x; });
                    });
            }
        });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,66 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "migraphx/gpu/device/visit.hpp"
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/reverse.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Reverse `arg1` along each axis listed in `axes`, writing into `result`.
argument
reverse(hipStream_t stream, argument result, argument arg1, const std::vector<int64_t>& axes)
{
    auto s = arg1.get_shape();
    // Pack the axis list into a shape's lens so it can be captured by value
    // inside the device lambda (element type is irrelevant; only lens is read).
    std::vector<std::size_t> axis_len(axes.begin(), axes.end());
    shape sa{shape::float_type, axis_len};
    std::size_t nelements = s.elements();
    visit_all(result, arg1)([&](auto output1, auto input1) {
        hip_visit_views(output1, input1, s)([&](auto output, auto input, auto hs) {
            hip_visit_views(sa)([&](auto daxes) {
                auto lens = hs.lens;
                gs_launch(stream, nelements)([=](auto i) __device__ {
                    auto idx = hs.multi(i);
                    auto in_idx = idx;
                    // Mirror the coordinate along each requested axis.
                    for(auto axis : daxes.lens)
                        in_idx[axis] = lens[axis] - 1 - idx[axis];
                    output[idx] = input[in_idx];
                });
            });
        });
    });
    return result;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,140 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/device/rnn_variable_seq_lens.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/shape.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Shift each batch sequence toward the end of the time axis: input timestep t
// moves to output timestep t + (max_len - seq_len[b]); the vacated leading
// positions are zero-filled. Indexed via a rank-3 shape where idx[0] is the
// timestep and idx[1] the batch element (per the usage below).
void rnn_var_sl_shift_sequence(hipStream_t stream,
                               const argument& result,
                               const argument& arg_hs,
                               const argument& arg_sl)
{
    auto output_shape = result.get_shape();
    int64_t max_len = output_shape.lens()[0];
    visit_all(result, arg_hs)([&](auto output, auto input) {
        const auto* in_data = device_cast(input.data());
        auto* out_data = device_cast(output.data());
        auto out_s = make_hip_shape<3>(output_shape);
        arg_sl.visit([&](auto sl) {
            const auto* sl_data = device_cast(sl.data());
            gs_launch(stream, output_shape.elements(), 256)([=](auto i) __device__ {
                auto idx = out_s.multi(i);
                auto t = idx[0];      // output timestep
                auto b = idx[1];      // batch element
                auto l = sl_data[b];  // this sequence's actual length
                // Read element 0 only to deduce the element type; the value
                // itself is immediately replaced by 0.
                auto val = in_data[0];
                val = 0;
                if(t >= max_len - l)
                {
                    auto in_idx = idx;
                    in_idx[0] -= (max_len - l);
                    val = in_data[out_s.index(in_idx)];
                }
                out_data[i] = val;
            });
        });
    });
}
// Inverse of rnn_var_sl_shift_sequence for RNN outputs: move each sequence's
// valid timesteps back to the front of the time axis and zero the tail.
// Indexed via a rank-4 shape: idx[0] = timestep, idx[1] = direction,
// idx[2] = batch element (per the usage below). The shift is applied for the
// backward direction (d == 1) or when the whole RNN is reversed.
void rnn_var_sl_shift_output(hipStream_t stream,
                             const argument& result,
                             const argument& arg_hs,
                             const argument& arg_sl,
                             bool is_reverse)
{
    auto output_shape = result.get_shape();
    int64_t max_len = output_shape.lens()[0];
    visit_all(result, arg_hs)([&](auto output, auto input) {
        const auto* in_data = device_cast(input.data());
        auto* out_data = device_cast(output.data());
        auto out_s = make_hip_shape<4>(output_shape);
        arg_sl.visit([&](auto sl) {
            const auto* sl_data = device_cast(sl.data());
            gs_launch(stream, output_shape.elements(), 256)([=](auto i) __device__ {
                auto idx = out_s.multi(i);
                auto t = idx[0];      // output timestep
                auto d = idx[1];      // direction (0 = forward, 1 = backward)
                auto b = idx[2];      // batch element
                auto l = sl_data[b];  // this sequence's actual length
                // Read element 0 only to deduce the element type; the value
                // itself is immediately replaced by 0.
                auto val = in_data[0];
                val = 0;
                if(t < l)
                {
                    int offset = (d == 1 or is_reverse) ? 1 : 0;
                    auto in_idx = idx;
                    in_idx[0] += offset * (max_len - l);
                    val = in_data[out_s.index(in_idx)];
                }
                out_data[i] = val;
            });
        });
    });
}
// Extract the last valid hidden state of each sequence: timestep seq_len[b]-1
// for the forward direction, timestep 0 for the backward direction or when
// the RNN is reversed. Input is rank-4 (time, direction, batch, hidden per
// the index usage below); the output drops the time dimension to length 1.
void rnn_var_sl_last_output(hipStream_t stream,
                            const argument& result,
                            const argument& arg_hs,
                            const argument& arg_sl,
                            bool is_reverse)
{
    auto input_shape = arg_hs.get_shape();
    // Output addressed like the input but with a single timestep.
    auto out_comp_lens = input_shape.lens();
    out_comp_lens[0] = 1;
    shape out_comp_shape{input_shape.type(), out_comp_lens};
    visit_all(result, arg_hs)([&](auto output, auto input) {
        const auto* in_data = device_cast(input.data());
        auto* out_data = device_cast(output.data());
        arg_sl.visit([&](auto sl) {
            const auto* sl_data = device_cast(sl.data());
            auto in_s = make_hip_shape<4>(input_shape);
            auto out_s = make_hip_shape<4>(out_comp_shape);
            gs_launch(stream, result.get_shape().elements(), 256)([=](auto i) __device__ {
                auto idx = out_s.multi(i);
                auto d = idx[1];      // direction (0 = forward, 1 = backward)
                auto b = idx[2];      // batch element
                auto l = sl_data[b];  // this sequence's actual length
                if(is_reverse or d == 1)
                {
                    idx[0] = 0;
                }
                else
                {
                    idx[0] = l - 1;
                }
                out_data[i] = in_data[in_s.index(idx)];
            });
        });
    });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,66 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/device/targets.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/errors.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// MIGRAPHX_GPU_TARGETS is a ';'-separated list substituted at configure time
// (see targets.hpp); split it once into individual target strings.
static std::vector<std::string> parse_targets() { return split_string(MIGRAPHX_GPU_TARGETS, ';'); }
// GPU targets this library was built for (parsed once, then cached).
const std::vector<std::string>& get_targets()
{
    static auto result = parse_targets();
    return result;
}
// Comma-separated form of get_targets(), for diagnostics.
std::string get_targets_as_string() { return join_strings(get_targets(), ", "); }
// Id of the currently selected HIP device; throws when no device is available.
static int get_device_id()
{
    int device;
    auto status = hipGetDevice(&device);
    if(status != hipSuccess)
        MIGRAPHX_THROW("No device");
    return device;
}
// Architecture name (gcnArchName, e.g. "gfx90a:sramecc+:xnack-") of the
// currently selected HIP device.
std::string get_device_name()
{
    hipDeviceProp_t props{};
    auto status = hipGetDeviceProperties(&props, get_device_id());
    if(status != hipSuccess)
        MIGRAPHX_THROW("Failed to get device properties");
    return props.gcnArchName;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,52 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_DEVICE_TARGETS_CPP
#define MIGRAPHX_GUARD_DEVICE_TARGETS_CPP
#include <migraphx/gpu/device/config.hpp>
#include <string>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// ';'-separated list of offload architectures, substituted from the CMake
// GPU_TARGETS variable when this header is configured.
#define MIGRAPHX_GPU_TARGETS "@GPU_TARGETS@" // NOLINT
// The configure-time target list, split into individual arch names
// (parsed once on first use; see targets.cpp).
MIGRAPHX_DEVICE_EXPORT
const std::vector<std::string>& get_targets();
// The same list joined with ", " for diagnostics.
MIGRAPHX_DEVICE_EXPORT
std::string get_targets_as_string();
// Architecture name of the currently selected HIP device.
MIGRAPHX_DEVICE_EXPORT
std::string get_device_name();
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_DEVICE_TARGETS_CPP

View File

@ -0,0 +1,239 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/topk.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/visit.hpp>
#include <migraphx/ranges.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// In-place binary heap over indirectly addressed storage: logical element i
// lives at data[data_index(i)], so the heap can operate directly on a strided
// tensor slice without copying it out.
// `compare` defines the heap order: a parent for which compare(parent, child)
// holds stays above its children, so the root is the element that wins
// `compare` against everything kept (with std::less<> the root is the minimum).
template <class T, class Index, class Compare>
struct hip_heap_vector
{
    // Takes ownership of nothing; heapifies the first n logical elements
    // of `val` immediately.
    MIGRAPHX_DEVICE_CONSTEXPR hip_heap_vector(T* val, index_int n, Index v_idx, Compare comp)
        : data(val), size(n), data_index(v_idx), compare(comp)
    {
        make_heap(size);
    }

    // Offer a candidate: if it wins `compare` against the root (the boundary
    // element) it is discarded; otherwise the root is evicted, the candidate
    // is written into the freed tail slot and sifted back up.
    MIGRAPHX_DEVICE_CONSTEXPR void try_push(const T val)
    {
        if(compare(val, data[data_index(0)]))
            return;
        pop_heap(size - 1);
        data[data_index(size - 1)] = val;
        push_heap(size - 1);
    }

    // In-place heapsort; afterwards elements are ordered opposite to
    // `compare` (with std::less<> this leaves them in descending order).
    MIGRAPHX_DEVICE_CONSTEXPR void sort() { sort_heap(size); }

    private:
    MIGRAPHX_DEVICE_CONSTEXPR inline static void swap(T& v1, T& v2) noexcept
    {
        T v = v1;
        v1  = v2;
        v2  = v;
    }

    // Sift the element at `index` down until neither child within the first
    // n elements wins `compare` against it.
    MIGRAPHX_DEVICE_CONSTEXPR inline void heapify_down(index_int n, index_int index)
    {
        while(index < n)
        {
            auto pre_index = index;
            index_int l    = 2 * index + 1;
            index_int r    = 2 * index + 2;
            if(l < n and compare(data[data_index(l)], data[data_index(index)]))
            {
                index = l;
            }
            if(r < n and compare(data[data_index(r)], data[data_index(index)]))
            {
                index = r;
                // Prefer the left child when it also wins against the right,
                // so the strongest child moves up.
                if(compare(data[data_index(l)], data[data_index(r)]))
                {
                    index = l;
                }
            }
            if(index == pre_index)
            {
                break;
            }
            swap(data[data_index(index)], data[data_index(pre_index)]);
        }
    }

    // Sift the element at `index` up while it wins `compare` against its
    // parent.
    MIGRAPHX_DEVICE_CONSTEXPR inline void heapify_up(index_int index)
    {
        while(index > 0)
        {
            auto parent_idx = (index - 1) / 2;
            if(not compare(data[data_index(index)], data[data_index(parent_idx)]))
            {
                break;
            }
            swap(data[data_index(index)], data[data_index(parent_idx)]);
            index = parent_idx;
        }
    }

    // Bottom-up heap construction over the first n elements.
    MIGRAPHX_DEVICE_CONSTEXPR inline void make_heap(index_int n)
    {
        for(int j = n / 2 - 1; j >= 0; --j)
        {
            heapify_down(n, j);
        }
    }

    MIGRAPHX_DEVICE_CONSTEXPR inline void push_heap(index_int loc) { heapify_up(loc); }

    // Move the root into slot `loc` and restore the heap over [0, loc).
    MIGRAPHX_DEVICE_CONSTEXPR inline void pop_heap(index_int loc)
    {
        swap(data[data_index(0)], data[data_index(loc)]);
        heapify_down(loc, 0);
    }

    // Classic heapsort: repeatedly swap the root to the shrinking tail.
    MIGRAPHX_DEVICE_CONSTEXPR inline void sort_heap(index_int n)
    {
        for(int j = n - 1; j > 0; --j)
        {
            swap(data[data_index(0)], data[data_index(j)]);
            heapify_down(j, 0);
        }
    }

    T* data = nullptr; // backing storage, addressed through data_index
    index_int size;    // number of logical elements kept in the heap
    Index data_index;  // maps logical index -> physical offset into data
    Compare compare;   // heap-order predicate
};
// Convenience factory that deduces the heap's template parameters from its
// arguments.
template <class T, class Index, class Compare>
__device__ hip_heap_vector<T, Index, Compare>
make_heap(T* data, index_int n, Index idx, Compare compare)
{
    return hip_heap_vector<T, Index, Compare>{data, n, idx, compare};
}
// Generic top-k kernel: for every slice along `axis`, select the k elements
// favored by `compare` and emit them, sorted, together with their original
// int64 positions along the axis.
// val_res receives the selected values and ind_res the indices; both are
// assumed to have extent k along `axis` — TODO confirm against the
// operator's shape computation.
template <class Compare>
std::vector<argument> topk(hipStream_t stream,
                           const argument& val_res,
                           const argument& ind_res,
                           const argument& arg,
                           int64_t k,
                           int64_t axis,
                           Compare compare)
{
    auto in_s     = arg.get_shape();
    auto in_lens  = in_s.lens();
    auto out_s    = val_res.get_shape();
    auto axis_dim = in_s.lens()[axis];
    // comp_s collapses the reduced axis to extent 1: one launch element per
    // slice to reduce.
    auto comp_lens  = in_lens;
    comp_lens[axis] = 1;
    shape comp_s{in_s.type(), comp_lens};
    std::size_t elem_num = comp_s.elements();
    hip_visit_all(val_res, arg, out_s, in_s, comp_s)(
        [&](auto out_val, auto input, auto oss, auto iss, auto css) {
            auto* data      = device_cast(input.data());
            auto* out       = device_cast(out_val.data());
            auto* const ind = ind_res.cast<int64_t>();
            gs_launch(stream, elem_num)([=](auto i) __device__ {
                // Multi-index of this slice; the axis coordinate is varied
                // through in_idx/out_idx below.
                auto idx = css.multi(i);
                auto in_idx = [&](int ii) {
                    auto iidx  = idx;
                    iidx[axis] = ii;
                    return iss.index(iidx);
                };
                auto out_idx = [&](int ii) {
                    auto iidx  = idx;
                    iidx[axis] = ii;
                    return oss.index(iidx);
                };
                // Order candidate axis positions by the data they refer to.
                auto data_compare = [=](auto ii, auto jj) {
                    return compare(data[in_idx(ii)], data[in_idx(jj)]);
                };
                // Seed the index output with the first k axis positions...
                for(int j = 0; j < k; ++j)
                {
                    ind[out_idx(j)] = j;
                }
                // ...heapify them in place inside the index buffer...
                auto hp = make_heap(ind, k, out_idx, data_compare);
                // ...then stream the remaining positions through the heap.
                for(int j = k; j < axis_dim; ++j)
                {
                    hp.try_push(j);
                }
                hp.sort();
                // Gather the selected values using the sorted indices.
                for(int j = 0; j < k; ++j)
                {
                    out[out_idx(j)] = data[in_idx(ind[out_idx(j)])];
                }
            });
        });
    return {val_res, ind_res};
}
// Top-k largest values: std::less<> makes the heap root the smallest kept
// element, so smaller candidates are rejected and the final sort leaves the
// largest value first.
// NOTE(review): the (values, indices) pair is wrapped in a single argument
// via brace-init from the returned vector — presumably a tuple argument;
// confirm against argument's constructors.
argument topk_largest(hipStream_t stream,
                      const argument& val_res,
                      const argument& ind_res,
                      const argument& arg,
                      int64_t k,
                      int64_t axis)
{
    return {topk(stream, val_res, ind_res, arg, k, axis, std::less<>{})};
}
// Top-k smallest values: std::greater<> makes the heap root the largest kept
// element, so larger candidates are rejected and the final sort leaves the
// smallest value first.
argument topk_smallest(hipStream_t stream,
                       const argument& val_res,
                       const argument& ind_res,
                       const argument& arg,
                       int64_t k,
                       int64_t axis)
{
    return {topk(stream, val_res, ind_res, arg, k, axis, std::greater<>{})};
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,68 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/errors.hpp>
#include <migraphx/rank.hpp>
#include <migraphx/stringutils.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Ordinal of the HIP device currently selected for this thread; throws when
// no device is available.
int get_device_id()
{
    int device_ordinal = -1;
    if(hipGetDevice(&device_ordinal) != hipSuccess)
        MIGRAPHX_THROW("No device");
    return device_ordinal;
}
// Architecture name of the current device, taken from the gcnArchName field
// of its HIP properties (may include a ":feature" suffix that callers strip).
std::string get_device_name()
{
    hipDeviceProp_t props{};
    if(hipGetDeviceProperties(&props, get_device_id()) != hipSuccess)
        MIGRAPHX_THROW("Failed to get device properties");
    return props.gcnArchName;
}
bool gfx_has_fp8fnuz_intrinsics()
{
const auto device_name = trim(split_string(get_device_name(), ':').front());
return (starts_with(device_name, "gfx94"));
}
bool gfx_has_fp8ocp_intrinsics()
{
const auto device_name = trim(split_string(get_device_name(), ':').front());
bool is_navi_with_fp8ocp = starts_with(device_name, "gfx12") and device_name >= "gfx1200";
bool is_mi_with_fp8ocp = starts_with(device_name, "gfx9") and device_name >= "gfx950";
return (is_navi_with_fp8ocp or is_mi_with_fp8ocp);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,31 @@
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
# Collect every .cpp in this directory as a gpu-driver source.
# NOTE(review): an explicit source list would be more robust than file(GLOB);
# CONFIGURE_DEPENDS re-checks the glob on each build but is not guaranteed to
# work reliably on all generators.
file(GLOB GPU_DRIVER_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
add_executable(gpu-driver
    ${GPU_DRIVER_SRCS}
)
# Run clang-tidy over the driver sources (rocm-cmake helper).
rocm_clang_tidy_check(gpu-driver)
# Private headers live under <current-dir>/include; relative paths here are
# resolved against CMAKE_CURRENT_SOURCE_DIR.
target_include_directories(gpu-driver PRIVATE include)
target_link_libraries(gpu-driver PRIVATE migraphx_gpu)

View File

@ -0,0 +1,50 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/errors.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
// Global name -> action registry. A function-local static is used so the
// map is constructed on first use, before any registration touches it.
auto& action_map()
{
    static std::unordered_map<std::string, action_function> registry;
    return registry;
}
// Look up the action registered under `name`; throws when no such action
// has been registered.
// Uses a single hash lookup rather than the count()-then-at() pair, which
// hashed the key twice.
action_function get_action(const std::string& name)
{
    const auto it = action_map().find(name);
    if(it == action_map().end())
        MIGRAPHX_THROW("Missing action: " + name);
    return it->second;
}
// Register (or overwrite) the action stored under `name`.
void register_action(const std::string& name, const action_function& a) { action_map()[name] = a; }
} // namespace driver
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,50 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/gpu/time_op.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
// Driver action "compile_op": compiles a GPU operator from its name and
// input shapes, times it, and prints `op -> output_shape: t ms`.
// Expected value keys: "name" (string), "inputs" (array of shape objects),
// optional "iterations" (default 100, also overridable via the "settings"
// object through parser::get).
struct compile_op : action<compile_op>
{
    static void apply(const parser& p, const value& v)
    {
        context ctx;
        auto inputs = p.parse_shapes(v.at("inputs"));
        // The whole value `v` is forwarded so the compiler can read any
        // operator-specific fields from it.
        auto op = gpu::compile_op(v.at("name").to<std::string>(), ctx, inputs, v);
        auto t  = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
        std::cout << op << " -> " << op.compute_shape(inputs) << ": " << t << "ms" << std::endl;
        std::cout << std::endl;
    }
};
} // namespace driver
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,60 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_GPU_DRIVER_ACTION_HPP
#define MIGRAPHX_GUARD_GPU_DRIVER_ACTION_HPP
#include <migraphx/config.hpp>
#include <migraphx/auto_register.hpp>
#include <migraphx/type_name.hpp>
#include <migraphx/gpu/driver/parser.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
// An action is a named callback the driver invokes for each top-level key
// of the input file; it receives the parser (for settings lookups) and the
// value stored under that key.
using action_function = std::function<void(const parser&, const value&)>;
// Look up a registered action by name; throws when the name is unknown.
action_function get_action(const std::string& name);
// Register (or overwrite) the action stored under `name`.
void register_action(const std::string& name, const action_function& a);
// Registrar used through the `action<T>` alias below: registers T::apply
// under T's unqualified type name (the text after the last "::").
// NOTE(review): if get_type_name<T>() ever returned a name without "::",
// rfind would be npos and npos + 2 wraps to 1, silently dropping the first
// character — confirm all registered types are namespace-qualified.
struct auto_register_action
{
    template <class T>
    static void apply()
    {
        const auto& name = get_type_name<T>();
        register_action(name.substr(name.rfind("::") + 2),
                        [](auto&&... xs) { T::apply(std::forward<decltype(xs)>(xs)...); });
    }
};
// Deriving a struct from action<T> auto-registers it at static-init time.
template <class T>
using action = auto_register<auto_register_action, T>;
} // namespace driver
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_DRIVER_ACTION_HPP

View File

@ -0,0 +1,68 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_GPU_DRIVER_PARSER_HPP
#define MIGRAPHX_GUARD_GPU_DRIVER_PARSER_HPP
#include <migraphx/value.hpp>
#include <migraphx/shape.hpp>
#include <unordered_map>
#include <functional>
#include <vector>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
// Print a diagnostic and terminate the driver (defined in parser.cpp).
[[noreturn]] void error(const std::string& msg);
// Parses driver input values into shapes and dispatches top-level keys to
// registered actions; an optional "settings" object provides fall-back
// values for lookups.
struct parser
{
    parser() = default;
    // Look up `key` in `v` first, then in the loaded settings, and finally
    // fall back to `default_value`.
    template <class T>
    T get(const value& v, const std::string& key, const T& default_value) const
    {
        return v.get(key, settings.get(key, default_value));
    }
    // Build a shape from a value with "lens"/"strides"/"type" keys.
    shape parse_shape(const value& v) const;
    // Parse an array of shape descriptions, preserving order.
    std::vector<shape> parse_shapes(const value& v) const;
    // Capture the optional top-level "settings" object.
    void load_settings(const value& v);
    // Validate the top-level object and dispatch each non-"settings" key to
    // its registered action.
    static void process(const value& v);

    private:
    value settings = value::object{};
};
} // namespace driver
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_DRIVER_PARSER_HPP

View File

@ -0,0 +1,44 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/driver/parser.hpp>
#include <migraphx/json.hpp>
#include <migraphx/convert_to_json.hpp>
#include <migraphx/file_buffer.hpp>
#include <iostream>
using namespace migraphx; // NOLINT
using namespace migraphx::gpu; // NOLINT
using namespace migraphx::gpu::driver; // NOLINT
// Entry point: reads the input file named on the command line, converts its
// text to strict JSON, parses it into a value, and dispatches each
// top-level key to the matching registered action.
int main(int argc, char const* argv[])
{
    if(argc < 2)
    {
        std::cout << "Usage: gpu-driver <input-file>" << std::endl;
        std::abort();
    }
    const std::string input_file = argv[1];
    auto v = from_json_string(convert_to_json(read_string(input_file)));
    parser::process(v);
}

View File

@ -0,0 +1,81 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/driver/parser.hpp>
#include <migraphx/gpu/driver/action.hpp>
#include <iostream>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
// Print a diagnostic and terminate the driver.
// NOTE(review): this writes to stdout and calls std::abort(); stderr plus a
// nonzero exit status would be friendlier for scripting — confirm intent.
[[noreturn]] void error(const std::string& msg)
{
    std::cout << msg << std::endl;
    std::abort();
}
// Build a shape from a value object with optional "lens", "strides" and
// "type" keys (settings provide fall-backs; type defaults to "float").
shape parser::parse_shape(const value& v) const
{
    const auto lens    = get(v, "lens", std::vector<std::size_t>{});
    const auto strides = get(v, "strides", std::vector<std::size_t>{});
    const auto type    = shape::parse_type(get<std::string>(v, "type", "float"));
    // Omitted or empty strides means a standard packed layout.
    return strides.empty() ? shape{type, lens} : shape{type, lens, strides};
}
std::vector<shape> parser::parse_shapes(const value& v) const
{
std::vector<shape> result;
std::transform(
v.begin(), v.end(), std::back_inserter(result), [&](auto&& x) { return parse_shape(x); });
return result;
}
// Capture the optional top-level "settings" object; its keys act as
// fall-back values for parser::get lookups.
void parser::load_settings(const value& v)
{
    if(v.contains("settings"))
        settings = v.at("settings");
}
// Validate the top-level object, load its "settings", then dispatch every
// remaining key to the action registered under that name.
void parser::process(const value& v)
{
    if(not v.is_object())
        error("Input is not an object");
    parser p{};
    p.load_settings(v);
    for(auto&& entry : v)
    {
        const auto& key = entry.get_key();
        if(key == "settings")
            continue;
        get_action(key)(p, entry.without_key());
    }
}
} // namespace driver
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,84 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/gpu/time_op.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/compile_ops.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
// Driver action "precompile_op": builds a one-instruction program from a
// pre-compile operator, lowers and compiles it down to a gpu::code_object,
// then times the code object and prints `op: t ms`.
struct precompile_op : action<precompile_op>
{
    // Wrap `preop` in a program whose parameters x0, x1, ... mirror `inputs`.
    // NOTE(review): the last entry of `inputs` is dropped before creating
    // parameters — it appears to be the output allocation shape; confirm.
    static program create_preop_program(const operation& preop, std::vector<shape> inputs)
    {
        program p;
        auto* mm = p.get_main_module();
        std::vector<instruction_ref> args;
        inputs.pop_back();
        transform(inputs, range(inputs.size()), std::back_inserter(args), [&](auto input, auto i) {
            return mm->add_parameter("x" + std::to_string(i), input);
        });
        mm->add_instruction(preop, args);
        return p;
    }
    // Find the gpu::code_object instruction produced by compilation; throws
    // when compilation did not yield one.
    static operation get_code_object(const program& p)
    {
        MIGRAPHX_TIDY_CONST auto* mm = p.get_main_module();
        auto it = std::find_if(mm->begin(), mm->end(), [](const auto& ins) {
            return (ins.name() == "gpu::code_object");
        });
        if(it == mm->end())
            MIGRAPHX_THROW("Failed to create code object");
        return it->get_operator();
    }
    // Expected value keys: "name", "inputs", optional "fields" (operator
    // attributes), "exhaustive" (bool, default false) and "iterations"
    // (default 100, also overridable via settings).
    static void apply(const parser& p, const value& v)
    {
        context ctx;
        auto inputs = p.parse_shapes(v.at("inputs"));
        auto name   = v.at("name").to<std::string>();
        auto preop  = make_op(name);
        if(v.contains("fields"))
            preop.from_value(v.at("fields"));
        bool exhaustive = v.get("exhaustive", false);
        auto prog       = create_preop_program(preop, inputs);
        run_passes(prog, {lowering{}, compile_ops{&ctx, exhaustive}});
        auto op = get_code_object(prog);
        auto t  = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
        std::cout << preop << ": " << t << "ms" << std::endl;
    }
};
} // namespace driver
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

View File

@ -0,0 +1,54 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/gpu/time_op.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
// Driver action "run_op": instantiates an operator by name and times its
// execution over the given input shapes, printing
// `op -> output_shape: t ms`.
// Expected value keys: "name", "inputs", optional "fields" and
// "iterations" (default 100, also overridable via settings).
struct run_op : action<run_op>
{
    static void apply(const parser& p, const value& v)
    {
        context ctx;
        auto inputs = p.parse_shapes(v.at("inputs"));
        auto name   = v.at("name").to<std::string>();
        // Names without a namespace default to the gpu:: operators.
        if(not contains(name, "::"))
            name = "gpu::" + name;
        auto op = make_op(name);
        if(v.contains("fields"))
            op.from_value(v.at("fields"));
        auto t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
        std::cout << op << " -> " << op.compute_shape(inputs) << ": " << t << "ms" << std::endl;
    }
};
} // namespace driver
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

Some files were not shown because too many files have changed in this diff Show More