diff --git a/docker/rocm/migraphx/targets/cpu/CMakeLists.txt b/docker/rocm/migraphx/targets/cpu/CMakeLists.txt
new file mode 100644
index 000000000..558e35387
--- /dev/null
+++ b/docker/rocm/migraphx/targets/cpu/CMakeLists.txt
@@ -0,0 +1,105 @@
+#####################################################################################
+# The MIT License (MIT)
+#
+# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#####################################################################################
+
+include(CheckCXXCompilerFlag)
+
+add_library(migraphx_cpu
+    allocate.cpp
+    allocation_model.cpp
+    binary.cpp
+    concat.cpp
+    convolution.cpp
+    copy.cpp
+    deconvolution.cpp
+    dnnl.cpp
+    eltwise.cpp
+    erf.cpp
+    fmod.cpp
+    fuse_ops.cpp
+    gather.cpp
+    gemm.cpp
+    layernorm.cpp
+    logsoftmax.cpp
+    lowering.cpp
+    lrn.cpp
+    mod.cpp
+    preallocate.cpp
+    pooling.cpp
+    reduction.cpp
+    reorder.cpp
+    softmax.cpp
+    sub.cpp
+    target.cpp
+    write_literals.cpp
+)
+set_target_properties(migraphx_cpu PROPERTIES EXPORT_NAME cpu)
+rocm_set_soversion(migraphx_cpu ${MIGRAPHX_SO_VERSION})
+
+set(MIGRAPHX_ENABLE_ZENDNN Off CACHE BOOL "")
+
+if(MIGRAPHX_ENABLE_ZENDNN)
+    find_path(ZENDNN_INC_PATH zendnn.hpp)
+    find_library(ZENDNN_LIB amdZenDNN)
+    find_library(BLIS_LIB blis)
+else()
+    find_package(dnnl REQUIRED)
+endif()
+
+rocm_clang_tidy_check(migraphx_cpu)
+if(MIGRAPHX_ENABLE_ZENDNN)
+    target_compile_definitions(migraphx_cpu PRIVATE -DMIGRAPHX_ENABLE_ZENDNN)
+    target_include_directories(migraphx_cpu PRIVATE ${ZENDNN_INC_PATH})
+    message(STATUS "ZENDNN_LIB: ${ZENDNN_LIB}")
+    target_link_libraries(migraphx_cpu PRIVATE ${BLIS_LIB})
+    target_link_libraries(migraphx_cpu PRIVATE ${ZENDNN_LIB})
+else()
+    target_link_libraries(migraphx_cpu PUBLIC DNNL::dnnl)
+endif()
+target_link_libraries(migraphx_cpu PRIVATE migraphx)
+
+migraphx_generate_export_header(migraphx_cpu)
+
+find_package(OpenMP)
+if(WIN32)
+    target_link_libraries(migraphx_cpu PUBLIC libomp)
+    target_include_directories(migraphx_cpu PUBLIC ${OpenMP_CXX_INCLUDE_DIRS})
+    target_compile_options(migraphx_cpu PUBLIC ${OpenMP_CXX_FLAGS})
+else()
+    target_link_libraries(migraphx_cpu PUBLIC OpenMP::OpenMP_CXX)
+    # Add library path to rpath to work around issues with our broken packages
+    foreach(LIBRARY ${OpenMP_CXX_LIBRARIES})
+        if(LIBRARY MATCHES "libomp")
+            get_filename_component(LIBRARY_PATH "${LIBRARY}" PATH)
+            target_link_libraries(migraphx_cpu PUBLIC -Wl,-rpath=${LIBRARY_PATH} -Wl,-rpath-link=${LIBRARY_PATH})
+        endif()
+    endforeach()
+endif()
+
+rocm_install_targets(
+  PRIVATE
+  TARGETS migraphx_cpu
+  INCLUDE
+  ${CMAKE_CURRENT_SOURCE_DIR}/include
+)
+
diff --git a/docker/rocm/migraphx/targets/cpu/allocate.cpp b/docker/rocm/migraphx/targets/cpu/allocate.cpp
new file mode 100644
index 000000000..938139c9b
--- /dev/null
+++ b/docker/rocm/migraphx/targets/cpu/allocate.cpp
@@ -0,0 +1,60 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace cpu {
+
+struct cpu_allocate : auto_register_op<cpu_allocate>
+{
+    shape s;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.s, "shape"));
+    }
+
+    std::string name() const { return "cpu::allocate"; }
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        check_shapes{inputs, *this}.has(0);
+        return s;
+    }
+    argument compute(context&, const shape& output_shape, const std::vector<argument>&) const
+    {
+        argument result{output_shape};
+        return result;
+    }
+};
+
+} // namespace cpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
diff --git a/docker/rocm/migraphx/targets/cpu/allocation_model.cpp b/docker/rocm/migraphx/targets/cpu/allocation_model.cpp
new file mode 100644
index 000000000..bd6833fb9
--- /dev/null
+++ b/docker/rocm/migraphx/targets/cpu/allocation_model.cpp
@@ -0,0 +1,46 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +std::string cpu_allocation_model::name() const { return "cpu::allocate"; } +operation cpu_allocation_model::allocate(const shape& s) const +{ + return make_op(name(), {{"shape", to_value(s)}}); +} + +operation cpu_allocation_model::preallocate(const shape& s, const std::string& id) const +{ + return make_op("cpu::preallocate", {{"shape", to_value(s)}, {"id", id}}); +} + +std::string cpu_allocation_model::copy() const { return "cpu::copy"; } + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/binary.cpp b/docker/rocm/migraphx/targets/cpu/binary.cpp new file mode 100644 index 000000000..e663f50e7 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/binary.cpp @@ -0,0 +1,83 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct dnnl_binary : dnnl_op +{ + std::string algo; + template + static auto reflect(Self& self, F f) + { + return pack_join(self.reflect_base(self, f), pack(f(self.algo, "algo"))); + } + + std::string group() const { return this->name() + "::" + algo; } + + std::string name() const { return "dnnl::binary"; } + + shape compute_shape(std::vector inputs) const + { + // Compensate for allocation + inputs.pop_back(); + check_shapes{this->trim_post_op_inputs(inputs), *this}.has(2); + auto s0 = inputs.at(0); + auto s1 = inputs.at(1); + auto r = s0; + if(s0 != s1 or not s0.packed()) + { + if(s0.packed() != s1.packed()) + { + r = s0.packed() ? s0 : s1; + } + else if(s0.broadcasted() != s1.broadcasted()) + { + r = s0.broadcasted() ? 
s1.with_lens(s0.lens()) : s0.with_lens(s0.lens()); + } + else + { + r = {s0.type(), s0.lens()}; + } + } + // Call to get_primitive to make sure an algo is available + this->get_primitive(this->to_memory_desc(r, inputs)); + return r; + } + + dnnl::binary::desc get_desc(const std::unordered_map& m) const + { + return {to_dnnl_algo(algo), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_1)), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST))}; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/concat.cpp b/docker/rocm/migraphx/targets/cpu/concat.cpp new file mode 100644 index 000000000..0c7cdc954 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/concat.cpp @@ -0,0 +1,67 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct dnnl_concat : dnnl_extend_op +{ + std::vector arg_map(int size) const + { + std::vector result(size); + std::iota(result.begin(), result.end(), MIGRAPHX_DNNL_PREFIX(ARG_MULTIPLE_SRC)); + return result; + } + // Custom desc class since its missing in dnnl + struct desc + { + dnnl::memory::desc dst; + std::size_t axis = 1; + std::vector srcs; + }; + desc get_desc(const std::unordered_map& m) const + { + std::vector srcs; + srcs.reserve(m.size() - 1); + + for(auto i = 0; i < m.size() - 1; i++) + { + srcs.push_back(m.at(MIGRAPHX_DNNL_PREFIX(ARG_MULTIPLE_SRC) + i)); + } + return {m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)), std::size_t(op.axis), srcs}; + } + + auto get_primitive_desc(const desc& d, const dnnl::primitive_attr& attr) const + { + return dnnl::concat::primitive_desc(d.dst, d.axis, d.srcs, get_dnnl_context().engine, attr); + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/convolution.cpp b/docker/rocm/migraphx/targets/cpu/convolution.cpp new file mode 100644 index 000000000..42e533003 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/convolution.cpp @@ -0,0 +1,86 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct dnnl_convolution + : dnnl_extend_op +{ + std::vector arg_map(int) const + { + return {MIGRAPHX_DNNL_PREFIX(ARG_SRC), MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)}; + } + + shape adjust_shape(const shape& x, int i, const shape& output) const + { + auto s = base_adjust_shape(x, output); + if(i == 1 and op.group > 1) + { + // TODO: Add support for transposed weights + if(not s.standard()) + MIGRAPHX_THROW("Weights for grouped convolution must be standard"); + auto lens = s.lens(); + lens.insert(lens.begin(), op.group); + lens.at(1) /= op.group; + return shape{s.type(), lens}; + } + return s; + } + + dnnl::convolution_forward::desc + get_desc(const std::unordered_map& m) const + { + // In DNNL dilation is zero-based + auto dilation = op.dilation; + std::transform( + dilation.begin(), dilation.end(), dilation.begin(), [](auto x) { return x - 1; }); + auto kdims = op.kdims(); + std::vector padding_l(op.padding.begin(), op.padding.begin() + kdims); + std::vector padding_r(op.padding.begin() + kdims, op.padding.end()); + return {dnnl::prop_kind::forward_inference, + dnnl::algorithm::convolution_auto, + m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)), + to_dnnl_dims(op.stride), + to_dnnl_dims(dilation), + to_dnnl_dims(padding_l), + to_dnnl_dims(padding_r)}; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/copy.cpp b/docker/rocm/migraphx/targets/cpu/copy.cpp new file mode 100644 index 000000000..4c4af2b71 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/copy.cpp @@ -0,0 +1,65 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct cpu_copy : reduce_dims_base, auto_register_op +{ + template + static auto reflect(Self&, F) + { + return pack(); + } + + std::string name() const { return "cpu::copy"; } + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(2); + return inputs.at(1); + } + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const + { + argument result = get_arg(args, args.size() - 1); + + visit_all(result, get_arg(args, 0))([&](auto output, auto input) { + pointwise(output, input)(ctx, output.get_shape(), 1024, [](auto& y, auto x) { y = x; }); + }); + + return result.reshape(output_shape); + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/deconvolution.cpp b/docker/rocm/migraphx/targets/cpu/deconvolution.cpp new file mode 100644 index 000000000..3398036e1 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/deconvolution.cpp @@ -0,0 +1,76 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct dnnl_deconvolution + : dnnl_extend_op +{ + std::vector arg_map(int) const + { + return {MIGRAPHX_DNNL_PREFIX(ARG_SRC), MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)}; + } + + shape adjust_shape(const shape& x, int i, const shape& output) const + { + auto s = base_adjust_shape(x, output); + if(i == 1) + { + // The input and output channels are flipped for dnnl + auto lens = s.lens(); + std::swap(lens[0], lens[1]); + auto strides = s.strides(); + std::swap(strides[0], strides[1]); + return {s.type(), lens, strides}; + } + return s; + } + + dnnl::deconvolution_forward::desc + get_desc(const std::unordered_map& m) const + { + // In DNNL dilation is zero-based + auto dilation = op.dilation; + std::transform( + dilation.begin(), dilation.end(), dilation.begin(), [](auto x) { return x - 1; }); + return {dnnl::prop_kind::forward_inference, + dnnl::algorithm::deconvolution_direct, + m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)), + to_dnnl_dims(op.stride), + to_dnnl_dims(dilation), + to_dnnl_dims(op.padding), + to_dnnl_dims(op.padding)}; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/dnnl.cpp b/docker/rocm/migraphx/targets/cpu/dnnl.cpp new file mode 100644 index 000000000..dc252cdfe --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/dnnl.cpp @@ -0,0 +1,205 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include + +#if defined(__GNUC__) && __GNUC__ <= 5 +namespace std { +#ifdef MIGRAPHX_ENABLE_ZENDNN +namespace dnnl = zendnn; +#endif +template <> +struct hash +{ + using argument_type = dnnl::algorithm; + using result_type = std::size_t; + result_type operator()(const argument_type& x) const noexcept + { + return std::hash>{}( + static_cast>(x)); + } +}; + +} // namespace std +#endif + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +dnnl_context& get_dnnl_context() +{ + static dnnl_context ctx{}; // NOLINT + return ctx; +} + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wswitch-enum" +#endif +dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t) +{ + using dt = dnnl::memory::data_type; + using st = shape::type_t; + switch(t) + { + case st::half_type: return dt::f16; + case st::float_type: return dt::f32; + case st::int32_type: return dt::s32; + case st::int8_type: return dt::s8; + case st::uint8_type: return dt::u8; + case st::fp8e4m3fnuz_type: MIGRAPHX_THROW("fp8e4m3fnuz unsupported in DNNL"); + default: MIGRAPHX_THROW("Unsupported data type"); + } +} +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + +dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n) +{ + switch(n) + { + case 1: return dnnl::memory::format_tag::a; + case 2: return dnnl::memory::format_tag::ab; + case 3: return dnnl::memory::format_tag::abc; + case 4: return dnnl::memory::format_tag::abcd; + case 5: return dnnl::memory::format_tag::abcde; + case 6: return dnnl::memory::format_tag::abcdef; + default: MIGRAPHX_THROW("Unsupported tensor size: " + std::to_string(n)); + } +} + +dnnl::memory::desc to_dnnl_memory_desc(const shape& s) +{ + return {to_dnnl_dims(s.lens()), to_dnnl_memory_data_type(s.type()), to_dnnl_dims(s.strides())}; +} + +dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a) +{ + return {desc, get_dnnl_context().engine, a.data()}; +} + +dnnl::memory to_dnnl_memory(const argument& a) +{ + return to_dnnl_memory(to_dnnl_memory_desc(a.get_shape()), a); +} + +// clang-format off +#define MIGRAPHX_VISIT_DNNL_ALGO(m) \ + m(undef) \ + m(convolution_auto) \ + m(convolution_direct) \ + m(convolution_winograd) \ + m(deconvolution_direct) \ + m(deconvolution_winograd) \ + m(eltwise_relu) \ + m(eltwise_tanh) \ + m(eltwise_elu) \ + m(eltwise_square) \ + m(eltwise_abs) \ + m(eltwise_sqrt) \ + m(eltwise_swish) \ + m(eltwise_linear) \ + m(eltwise_bounded_relu) \ + m(eltwise_soft_relu) \ + m(eltwise_logistic) \ + m(eltwise_exp) \ + m(eltwise_gelu) \ + m(eltwise_gelu_tanh) \ + m(eltwise_gelu_erf) \ + m(eltwise_log) \ + m(eltwise_clip) \ + m(eltwise_pow) \ + m(eltwise_round) \ + m(eltwise_relu_use_dst_for_bwd) \ + m(eltwise_tanh_use_dst_for_bwd) \ + m(eltwise_elu_use_dst_for_bwd) \ + m(eltwise_sqrt_use_dst_for_bwd) \ + m(eltwise_logistic_use_dst_for_bwd) \ + m(eltwise_exp_use_dst_for_bwd) \ + m(lrn_across_channels) \ + m(lrn_within_channel) \ + m(pooling_max) \ + m(pooling_avg) \ + m(pooling_avg_include_padding) \ + m(pooling_avg_exclude_padding) \ + m(vanilla_rnn) \ + m(vanilla_lstm) \ + m(vanilla_gru) \ + m(lbr_gru) \ + m(binary_add) \ + m(binary_mul) \ + m(binary_max) \ + m(binary_min) \ + m(binary_div) \ + m(resampling_nearest) \ + m(resampling_linear) \ + m(reduction_max) \ + m(reduction_min) \ + m(reduction_sum) \ + m(reduction_mul) \ + m(reduction_mean) \ + m(reduction_norm_lp_max) \ + m(reduction_norm_lp_sum) \ + m(reduction_norm_lp_power_p_max) \ + m(reduction_norm_lp_power_p_sum) +// 
clang-format on + +const std::unordered_map& dnnl_algo_map() +{ + static const std::unordered_map m = { +#define MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR(x) {#x, dnnl::algorithm::x}, + MIGRAPHX_VISIT_DNNL_ALGO(MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR) +#undef MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR + }; + return m; +} + +dnnl::algorithm to_dnnl_algo(const std::string& name) +{ + if(dnnl_algo_map().count(name) == 0) + MIGRAPHX_THROW("Missing dnnl algo: " + name); + return dnnl_algo_map().at(name); +} + +const std::unordered_map& dnnl_algo_string_map() +{ + static const std::unordered_map m = { +#define MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR(x) {dnnl::algorithm::x, #x}, + MIGRAPHX_VISIT_DNNL_ALGO(MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR) +#undef MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR + }; + return m; +} + +std::string to_string(const dnnl::algorithm& algo) +{ + if(dnnl_algo_string_map().count(algo) == 0) + return "unknown_" + std::to_string(static_cast(algo)); + return dnnl_algo_string_map().at(algo); +} + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/eltwise.cpp b/docker/rocm/migraphx/targets/cpu/eltwise.cpp new file mode 100644 index 000000000..5b328cb7e --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/eltwise.cpp @@ -0,0 +1,73 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct dnnl_eltwise : dnnl_op +{ + std::string algo; + float alpha = 0; + float beta = 0; + template + static auto reflect(Self& self, F f) + { + return pack_join(self.reflect_base(self, f), + pack(f(self.algo, "algo"), f(self.alpha, "alpha"), f(self.beta, "beta"))); + } + + std::string group() const { return this->name() + "::" + algo; } + + std::string name() const { return "dnnl::eltwise"; } + + shape compute_shape(std::vector inputs) const + { + // Compensate for allocation + inputs.pop_back(); + check_shapes{this->trim_post_op_inputs(inputs), *this}.has(1).packed(); + auto s = inputs.at(0); + auto r = s; + if(not s.packed()) + r = shape{s.type(), s.lens()}; + // Call to get_primitive to make sure an algo is available + this->get_primitive(this->to_memory_desc(r, inputs)); + return r; + } + + dnnl::eltwise_forward::desc get_desc(const std::unordered_map& m) const + { + return {dnnl::prop_kind::forward_inference, + to_dnnl_algo(algo), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)), + alpha, + beta}; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/erf.cpp b/docker/rocm/migraphx/targets/cpu/erf.cpp new file mode 100644 index 000000000..9fa34b4fa --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/erf.cpp @@ -0,0 +1,36 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +template struct cpu_unary; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/fmod.cpp b/docker/rocm/migraphx/targets/cpu/fmod.cpp new file mode 100644 index 000000000..ade453147 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/fmod.cpp @@ -0,0 +1,36 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +template struct cpu_binary; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/fuse_ops.cpp b/docker/rocm/migraphx/targets/cpu/fuse_ops.cpp new file mode 100644 index 000000000..a4f8fe78f --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/fuse_ops.cpp @@ -0,0 +1,134 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_DNNL_POST_OPS_WORKAROUND); + +MIGRAPHX_PRED_MATCHER(has_post_ops, instruction_ref ins) +{ + auto v = ins->get_operator().to_value(); + return v.contains("post_ops"); +} + +MIGRAPHX_PRED_MATCHER(without_post_ops, instruction_ref ins) +{ + auto v = ins->get_operator().to_value(); + return v.contains("post_ops") and v["post_ops"].empty(); +} + +bool workaround_dnnl_broken_post_ops(const operation& op, const operation& post_op) +{ + if(contains({"dnnl::dot", "dnnl::convolution"}, op.name())) + return true; + auto pv = post_op.to_value(); + if(not pv.at("post_ops").empty()) + return true; + auto v = op.to_value(); + auto last_op = v.at("post_ops").empty() ? v : v.at("post_ops").back(); + auto algo = last_op.contains("algo") ? last_op.at("algo").to() : op.name(); + auto post_algo = pv["algo"].to(); + if(starts_with(algo, "eltwise") and starts_with(post_algo, "eltwise")) + return true; + if(algo == post_algo) + return true; + return false; +} + +operation merge_post_ops(const operation& op, const operation& post_op) +{ + auto pv = post_op.to_value(); + auto v = op.to_value(); + v["post_ops"].push_back({{"algo", pv["algo"]}, + {"alpha", pv["alpha"].value_or(0.0f)}, + {"beta", pv["beta"].value_or(0.0f)}}); + auto post_ops = pv.at("post_ops"); + for(const auto& po : post_ops) + v["post_ops"].push_back(po); + return make_op(op.name(), v); +} + +struct find_post_ops +{ + context* ctx = nullptr; + match::any_matcher matcher() const + { + if(enabled(MIGRAPHX_DISABLE_DNNL_POST_OPS_WORKAROUND{})) + return match::name("dnnl::eltwise", + "dnnl::binary")(match::arg(0)(has_post_ops(), match::used_once())); + else + { + auto dnnl_binary = match::name("dnnl::binary")(without_post_ops(), match::used_once()); + return match::name("dnnl::eltwise")(without_post_ops(), match::arg(0)(dnnl_binary)); + } + } + + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + auto x_ins = ins->inputs().front(); + auto x = x_ins->get_operator(); + + if(workaround_dnnl_broken_post_ops(x, ins->get_operator())) + return; + + auto op = merge_post_ops(x, ins->get_operator()); + auto inputs = x_ins->inputs(); + inputs.back() = ins->inputs().back(); + if(ins->name() == "dnnl::binary") + inputs.insert(std::prev(inputs.end()), ins->inputs().at(1)); + auto input_shapes = to_shapes(inputs); + auto new_shape = try_compute_shape(op, input_shapes); + if(new_shape.empty() or new_shape.front() != ins->get_shape()) + return; + auto info = compile(op, *ctx, new_shape.front(), input_shapes); + if(info.contains("impl") and starts_with(info.at("impl").to(), "ref:")) + return; + m.replace_instruction(ins, op, inputs); + } +}; + +void fuse_ops::apply(module& m) const +{ + for(std::size_t i = 0; i < 4; i++) + { + match::find_matches(m, find_post_ops{ctx}); + dead_code_elimination{}.apply(m); + } +} + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/gather.cpp b/docker/rocm/migraphx/targets/cpu/gather.cpp new file mode 100644 index 000000000..40bc556b9 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/gather.cpp @@ -0,0 +1,88 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct cpu_gather : auto_register_op +{ + op::gather op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + std::string name() const { return "cpu::" + op.name(); } + shape compute_shape(std::vector inputs) const + { + // Compensate for allocation + inputs.pop_back(); + check_shapes(inputs, *this).standard(); + return migraphx::compute_shape(op, inputs); + } + + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const + { + std::size_t nelements = output_shape.elements(); + auto lens = args[0].get_shape().lens(); + auto axis_dim_size = lens[op.axis]; + lens[op.axis] = args[1].get_shape().elements(); + shape out_comp{output_shape.type(), lens}; + + visit_all(args.back(), args[0])([&](auto output, auto input) { + args[1].visit([&](auto indices) { + const auto* indices_ptr = indices.data(); + auto* output_ptr = output.data(); + ctx.bulk_execute(nelements, 1024, [=](auto start, auto end) { + for(auto i = start; i < end; i++) + { + auto idx = out_comp.multi(i); + auto in_index = indices_ptr[idx[op.axis]]; + in_index = (in_index < 0) ? in_index + axis_dim_size : in_index; + idx[op.axis] = in_index; + output_ptr[i] = input(idx.begin(), idx.end()); + } + }); + }); + }); + + return args.back(); + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/gemm.cpp b/docker/rocm/migraphx/targets/cpu/gemm.cpp new file mode 100644 index 000000000..50f42d5fe --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/gemm.cpp @@ -0,0 +1,62 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct dnnl_gemm : dnnl_extend_op +{ + std::vector arg_map(int) const + { + return {MIGRAPHX_DNNL_PREFIX(ARG_SRC), + MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS), + MIGRAPHX_DNNL_PREFIX(ARG_BIAS)}; + } + + template + void required(const check_shapes& cs) const + { + cs.not_broadcasted(); + } + + dnnl::matmul::desc get_desc(const std::unordered_map& m) const + { + return {m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST))}; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/allocation_model.hpp b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/allocation_model.hpp new file mode 100644 index 000000000..4ee101331 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/allocation_model.hpp @@ -0,0 +1,49 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_ALLOCATION_MODEL_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_ALLOCATION_MODEL_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct cpu_allocation_model +{ + std::string name() const; + std::string copy() const; + operation allocate(const shape& s) const; + operation preallocate(const shape& s, const std::string& id) const; + bool needs_out_params() const { return false; } +}; + +} // namespace cpu + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/context.hpp b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/context.hpp new file mode 100644 index 000000000..461dbcb39 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/context.hpp @@ -0,0 +1,58 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP +#define MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct context +{ + void finish() const {} + + template + void bulk_execute(std::size_t n, std::size_t min_grain, F f) + { + cpu::parallel_for(n, min_grain, f); + } + + template + void bulk_execute(std::size_t n, F f) + { + this->bulk_execute(n, 256, f); + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/dnnl.hpp b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/dnnl.hpp new file mode 100644 index 000000000..b05cad852 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/dnnl.hpp @@ -0,0 +1,441 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_DNNL_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_DNNL_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef MIGRAPHX_ENABLE_ZENDNN +#include +#else +#include +#endif + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +#ifdef MIGRAPHX_ENABLE_ZENDNN +namespace dnnl = zendnn; +#define MIGRAPHX_CONCAT_PREFIX(b) ZENDNN_##b // NOLINT +#else +#define MIGRAPHX_CONCAT_PREFIX(b) DNNL_##b // NOLINT +#endif +#define MIGRAPHX_DNNL_PREFIX(b) MIGRAPHX_CONCAT_PREFIX(b) // NOLINT + +struct dnnl_context +{ + dnnl::engine engine; + dnnl::stream stream; + dnnl_context() : engine(dnnl::engine::kind::cpu, 0), stream(engine) {} +}; + +dnnl_context& get_dnnl_context(); + +dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t); + +dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n); + +template +inline dnnl::memory::dims to_dnnl_dims(R&& r) +{ + return {r.begin(), r.end()}; +} + +dnnl::memory::desc to_dnnl_memory_desc(const shape& s); + +dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a); + +dnnl::memory to_dnnl_memory(const argument& a); + +dnnl::algorithm to_dnnl_algo(const std::string& name); + +std::string to_string(const dnnl::algorithm& algo); + +struct post_op : reflect_equality, reflect_stream +{ + std::string algo; + float alpha = 0; + float beta = 0; + template + static auto reflect(Self& self, F f) + { + return pack(f(self.algo, "algo"), f(self.alpha, "alpha"), f(self.beta, "beta")); + } +}; + +template +struct execute_wrapper +{ + F f; + argument operator()(context&, const std::vector& args) const { return f(args); } +}; + +template +execute_wrapper make_execute_wrapper(F f) +{ + return {std::move(f)}; +} + +template +struct dnnl_op : auto_register_op +{ + std::vector post_ops; + std::function& args)> execute; + + template + static auto reflect_base(Self& self, F f) + { + return pack(f(self.post_ops, "post_ops")); + } + + template + static auto reflect(Self& self, F f) + { + return reflect_base(self, f); + } + + std::string group() const + { + const auto& self = static_cast(*this); + return self.name(); + } + + value attributes() const + { + std::vector names; + std::transform(post_ops.begin(), post_ops.end(), std::back_inserter(names), [](auto&& op) { + return op.algo; + }); + const auto& self = static_cast(*this); + auto g = self.group(); + if(not names.empty()) + g += 
"<" + join_strings(names, ",") + ">"; + return {{"group", g}}; + } + + std::size_t get_extra_post_op_args() const + { + return std::count_if(post_ops.begin(), post_ops.end(), [](const auto& po) { + return contains(po.algo, "binary"); + }); + } + + static std::size_t get_binary_post_op_arg(std::size_t pos) + { + return MIGRAPHX_DNNL_PREFIX(ARG_ATTR_MULTIPLE_POST_OP)(pos) | // NOLINT + MIGRAPHX_DNNL_PREFIX(ARG_SRC_1); // NOLINT + } + + static std::vector to_shapes(const std::vector& args) + { + std::vector shapes(args.size()); + std::transform(args.begin(), args.end(), shapes.begin(), [](const argument& a) { + return a.get_shape(); + }); + return shapes; + } + static std::string impl(const Primitive& prim) + { + auto desc = prim.get_primitive_desc(); + const char* str = nullptr; +#ifdef MIGRAPHX_ENABLE_ZENDNN + zendnn_primitive_desc_query( + desc, zendnn_query_impl_info_str, 0, reinterpret_cast(&str)); +#else + dnnl_primitive_desc_query(desc, dnnl_query_impl_info_str, 0, reinterpret_cast(&str)); +#endif + return str == nullptr ? "" : str; + } + // Map arg index to arg in dnnl + std::vector arg_map(int size) const + { + std::vector result(size); + std::iota(result.begin(), result.end(), MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)); + return result; + } + shape base_adjust_shape(const shape& s, const shape& output) const + { + if(s.broadcasted()) + { + auto lens = s.lens(); + auto strides = s.strides(); + std::transform(strides.begin(), + strides.end(), + lens.begin(), + lens.begin(), + [](auto stride, auto len) -> std::size_t { + if(stride == 0) + return 1; + else + return len; + }); + // Use the permutation of the output + return output.with_lens(s.type(), lens); + } + return s; + } + template + void for_each_post_op(F f) const + { + int i = 0; + for(auto&& op : post_ops) + { + if(contains(op.algo, "binary")) + { + f(op, get_binary_post_op_arg(i)); + } + else + { + f(op, -1); + } + i++; + } + } + shape adjust_shape(const shape& s, int, const shape& output) const + { + return base_adjust_shape(s, output); + } + std::vector create_arg_map(std::size_t input_size) const + { + const auto& self = static_cast(*this); + auto npost_ops = get_extra_post_op_args(); + auto prim_input_size = input_size - npost_ops; + auto m = self.arg_map(prim_input_size); + for_each_post_op([&](auto&&, auto arg) { + if(arg < 0) + return; + m.push_back(arg); + }); + return m; + } + std::unordered_map + to_memory_desc(const shape& output_shape, const std::vector& inputs) const + { + const auto& self = static_cast(*this); + std::unordered_map result; + result[MIGRAPHX_DNNL_PREFIX(ARG_DST)] = + to_dnnl_memory_desc(self.adjust_shape(output_shape, inputs.size(), output_shape)); + auto m = create_arg_map(inputs.size()); + assert(m.size() >= inputs.size()); + for(int i = 0; i < inputs.size(); i++) + { + result[m[i]] = to_dnnl_memory_desc(self.adjust_shape(inputs[i], i, output_shape)); + } + return result; + } + dnnl::primitive_attr + get_primitive_attr(const std::unordered_map& m) const + { + dnnl::primitive_attr result; + dnnl::post_ops po; + for_each_post_op([&](auto&& op, auto arg) { + if(contains(op.algo, "binary_add")) + { + auto desc = m.at(arg); + if(desc == m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST))) + po.append_sum(1.0f); + else + po.append_binary(to_dnnl_algo(op.algo), m.at(arg)); + } + else if(contains(op.algo, "binary")) + { + po.append_binary(to_dnnl_algo(op.algo), m.at(arg)); + } + else if(contains(op.algo, "eltwise")) + po.append_eltwise(1.0f, to_dnnl_algo(op.algo), op.alpha, op.beta); + else + MIGRAPHX_THROW("Unknown post op algo: 
" + op.algo); + }); + result.set_post_ops(po); + return result; + } + template + auto get_primitive_desc(const T& desc, const dnnl::primitive_attr& attr) const + -> decltype(typename Primitive::primitive_desc(desc, attr, get_dnnl_context().engine)) + { + return typename Primitive::primitive_desc(desc, attr, get_dnnl_context().engine); + } + Primitive get_primitive(const std::unordered_map& m) const + { + const auto& self = static_cast(*this); + auto desc = self.get_desc(m); + auto attr = MIGRAPHX_ASSERT_NO_THROW(this->get_primitive_attr(m)); + auto pd = self.get_primitive_desc(desc, attr); + return Primitive(pd); + } + argument compute(context& ctx, const shape&, const std::vector& args) const + { + return execute(ctx, args); + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } + value compile(context&, const shape& output_shape, std::vector inputs) + { + // Compensate for allocation + inputs.pop_back(); + auto md = to_memory_desc(output_shape, inputs); + auto prim = get_primitive(md); + auto impl_name = impl(prim); + return {{"impl", impl_name}}; + } + + void finalize(context&, const shape& output_shape, std::vector inputs) + { + // Compensate for allocation + inputs.pop_back(); + const auto& self = static_cast(*this); + auto name = self.name(); + auto md = to_memory_desc(output_shape, inputs); + auto prim = get_primitive(md); + auto arg_lookup = create_arg_map(inputs.size()); +#ifndef NDEBUG + auto prim_attr = get_primitive_attr(md); +#endif + execute = make_execute_wrapper([=](const std::vector& args) { +#ifndef NDEBUG + // Check that the memory descriptors have not changed + auto debug_args = args; + debug_args.pop_back(); + auto debug_md = to_memory_desc(output_shape, to_shapes(debug_args)); + for(auto&& p : debug_md) + { + if(md.count(p.first) == 0) + MIGRAPHX_THROW(name + + ": Missing memory descriptor for: " + std::to_string(p.first)); + if(p.second == md.at(p.first)) + continue; + MIGRAPHX_THROW(name + + ": Memory descriptor has changed for: " + std::to_string(p.first)); + } + // Check post_ops args are correct + auto pos = prim_attr.get_post_ops(); + auto prim_input_size = inputs.size() - this->get_extra_post_op_args(); + int j = 0; + for(int i = 0; i < pos.len(); i++) + { + auto arg = j + prim_input_size; + auto kind = pos.kind(i); + std::string mesg = + "Post op " + std::to_string(i) + "@" + std::to_string(arg) + ": "; + try + { + dnnl::algorithm algo; + dnnl::memory::desc mdesc; + float scale = 0; + float alpha = 0; + float beta = 0; + if(kind == dnnl::primitive::kind::binary) + { + pos.get_params_binary(i, algo, mdesc); + if(mdesc != md.at(arg_lookup.at(arg))) + MIGRAPHX_THROW(mesg + + "Memory descriptor doesn't match for binary post op"); + j++; + } + else if(kind == dnnl::primitive::kind::eltwise) + { + pos.get_params_eltwise(i, scale, algo, alpha, beta); + } + else if(kind == dnnl::primitive::kind::sum) + { + pos.get_params_sum(i, scale); + algo = dnnl::algorithm::binary_add; + } + else + { + MIGRAPHX_THROW("Unknown kind"); + } + if(to_dnnl_algo(post_ops[i].algo) != algo) + MIGRAPHX_THROW(mesg + "Algorithm doesn't match for post op " + + post_ops[i].algo + " != " + to_string(algo)); + } + catch(const dnnl::error& e) + { + MIGRAPHX_THROW(mesg + "Failed to get post ops argument " + ": " + e.what()); + } + } +#endif + std::unordered_map m; + m[MIGRAPHX_DNNL_PREFIX(ARG_DST)] = + to_dnnl_memory(md.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)), args.back()); + for(int i = 0; i < args.size() - 1; i++) + m[arg_lookup[i]] = 
to_dnnl_memory(md.at(arg_lookup[i]), args[i]); + prim.execute(get_dnnl_context().stream, m); + return args.back(); + }); + } + std::vector trim_post_op_inputs(const std::vector& inputs) const + { + auto prim_input_size = inputs.size() - this->get_extra_post_op_args(); + return {inputs.begin(), inputs.begin() + prim_input_size}; + } +}; + +template +struct dnnl_extend_op : dnnl_op +{ + Op op; + + template + static auto reflect(Self& self, F f) + { + return pack_join(self.reflect_base(self, f), migraphx::reflect(self.op, f)); + } + + // dnnl has some issues with non-packed inputs + template + void required(const check_shapes& cs) const + { + cs.packed_or_broadcasted(); + } + + std::string name() const { return "dnnl::" + op.name(); } + shape compute_shape(std::vector inputs) const + { + const auto& self = static_cast(*this); + // Compensate for allocation + inputs.pop_back(); + self.required(check_shapes(inputs, self)); + auto r = migraphx::compute_shape(op, this->trim_post_op_inputs(inputs)); + // Call to get_primitive to make sure an algo is available + this->get_primitive(this->to_memory_desc(r, inputs)); + return r; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/fuse_ops.hpp b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/fuse_ops.hpp new file mode 100644 index 000000000..e0918846a --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/fuse_ops.hpp @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_CPU_FUSE_OPS_HPP +#define MIGRAPHX_GUARD_CPU_FUSE_OPS_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module; + +namespace cpu { + +struct MIGRAPHX_CPU_EXPORT fuse_ops +{ + context* ctx = nullptr; + std::string name() const { return "cpu::fuse_ops"; } + void apply(module& m) const; +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_CPU_FUSE_OPS_HPP diff --git a/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/lowering.hpp b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/lowering.hpp new file mode 100644 index 000000000..d4b96c543 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/lowering.hpp @@ -0,0 +1,46 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP +#define MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module; + +namespace cpu { + +struct MIGRAPHX_CPU_EXPORT lowering +{ + std::string name() const { return "cpu::lowering"; } + void apply(module& m) const; +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/parallel.hpp b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/parallel.hpp new file mode 100644 index 000000000..cb3b9ed64 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/parallel.hpp @@ -0,0 +1,125 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP + +// #define MIGRAPHX_DISABLE_OMP +#include +#include +#include +#ifdef MIGRAPHX_DISABLE_OMP +#include +#else + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-identifier" +#endif +#include +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + +#endif + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +#ifdef MIGRAPHX_DISABLE_OMP + +inline std::size_t max_threads() { return std::thread::hardware_concurrency(); } + +template +void parallel_for_impl(std::size_t n, std::size_t threadsize, F f) +{ + if(threadsize <= 1) + { + f(std::size_t{0}, n); + } + else + { + std::vector threads(threadsize); +// Using const here causes gcc 5 to ICE +#if(!defined(__GNUC__) || __GNUC__ != 5) + const +#endif + std::size_t grainsize = std::ceil(static_cast(n) / threads.size()); + + std::size_t work = 0; + std::generate(threads.begin(), threads.end(), [=, &work] { + auto result = joinable_thread([=]() mutable { + assert(work < n); + f(work, std::min(n, work + grainsize)); + }); + work += grainsize; + return result; + }); + // cppcheck-suppress unsignedLessThanZero + assert(work >= n); + } +} +#else + +inline std::size_t max_threads() { return omp_get_max_threads(); } + +template +void parallel_for_impl(std::size_t n, std::size_t threadsize, F f) +{ + if(threadsize <= 1) + { + f(std::size_t{0}, n); + } + else + { + std::size_t grainsize = std::ceil(static_cast(n) / threadsize); +#pragma omp parallel for num_threads(threadsize) schedule(static, 1) + for(std::size_t tid = 0; tid < threadsize; tid++) + { + std::size_t work = tid * grainsize; + assert(work < n); + f(work, std::min(n, work + grainsize)); + } + } +} +#endif +template +void parallel_for(std::size_t n, std::size_t min_grain, F f) +{ + const auto threadsize = std::min(max_threads(), n / min_grain); + parallel_for_impl(n, threadsize, f); +} + +template +void parallel_for(std::size_t n, F f) +{ + const int min_grain = 8; + parallel_for(n, min_grain, f); +} + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/pointwise.hpp b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/pointwise.hpp new file mode 100644 index 000000000..ece5498c8 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/pointwise.hpp @@ -0,0 +1,414 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_POINTWISE_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_POINTWISE_HPP + +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct multi_index +{ + constexpr multi_index() = default; + + multi_index(const shape& s, std::size_t i) : n(s.lens().size()) + { + assert(n < max_size); + std::copy(s.lens().begin(), s.lens().end(), dims); + s.multi_copy(i, index, index + max_size); + } + + constexpr std::size_t size() const { return n; } + + constexpr std::size_t* begin() { return index; } + constexpr const std::size_t* begin() const { return index; } + + constexpr std::size_t* end() { return index + size(); } + constexpr const std::size_t* end() const { return index + size(); } + + std::size_t offset(const shape& s) const { return s.index(begin(), end()); } + + constexpr void carry() + { + std::size_t overflow = 0; + for(std::ptrdiff_t i = size() - 1; i > 0; i--) + { + auto z = index[i] + overflow; + // Reset overflow + overflow = 0; + // Compute overflow using while loop instead of mod + // overflow = z / dims[i]; + // z = z % dims[i]; + while(z >= dims[i]) + { + z -= dims[i]; + overflow += 1; + } + index[i] = z; + // Exit if there is no overflow + if(overflow == 0) + return; + } + index[0] += overflow; + } + + constexpr void increment(std::size_t i) + { + index[size() - 1] += i; + carry(); + } + + constexpr multi_index& operator+=(std::size_t i) + { + increment(i); + return *this; + } + + constexpr multi_index& operator++() + { + increment(1); + return *this; + } + multi_index operator++(int) // NOLINT + { + multi_index result = *this; + increment(1); + return result; + } + + private: + static const std::size_t max_size = 5; + std::size_t index[max_size] = {}; + std::size_t dims[max_size] = {}; + std::size_t n = 0; +}; + +struct reduce_dims_base +{ + std::vector reduce_shapes; + + void finalize(context&, const shape&, const std::vector& inputs) + { + reduce_shapes = reduce_dims(inputs); + } + + argument get_arg(const std::vector& args, std::size_t i) const + { + if(reduce_shapes.empty()) + return args[i]; + return args.at(i).reshape(reduce_shapes.at(i)); + } + + argument get_output() const + { + argument a{reduce_shapes[0]}; + return a; + } +}; + +template +struct vec +{ + using array_type = std::array; + using vector_type __attribute__((vector_size(N * sizeof(T)))) = T; + union + { 
+ array_type array; + vector_type vector; + }; + + static_assert(sizeof(array_type) == sizeof(vector_type), "Not the same size"); +}; + +template +constexpr std::integral_constant vec_size(const T&) +{ + return {}; +} + +template +constexpr std::integral_constant vec_size(const vec&) +{ + return {}; +} + +template +constexpr std::size_t vec_size() +{ + return decltype(vec_size(std::declval())){}; +} + +template () > 0))> +void vec_apply(F f, V& v, Vs... vs) +{ + assert(all_of({vec_size()...}, [&](auto n) { return n == vec_size(); })); + assert(vec_size() == v.array.size()); + for(std::size_t i = 0; i < vec_size(); i++) + f(v.array[i], vs.vector[i]...); +} + +template () == 0))> +void vec_apply(F f, V& v, Vs&... vs) +{ + f(v, vs...); +} + +inline std::size_t find_packed_len(const shape& s) +{ + for(std::size_t i = 0; i < s.lens().size(); i++) + { + if(s.lens()[i] > 1 and s.strides()[i] == 1) + { + return i; + } + } + return -1; +} + +template +shape vectorize(const shape& s) +{ + assert(s.standard() or s.broadcasted()); + auto lens = s.lens(); + if(s.broadcasted()) + { + auto n = find_packed_len(s); + assert(n != -1); + assert((lens[n] % N) == 0); + lens[n] /= N; + return {s.type(), lens, s.strides()}; + } + assert((lens.back() % N) == 0); + lens.back() /= N; + return {s.type(), lens}; +} + +template +tensor_view> vectorize(tensor_view tv) +{ + return {vectorize(tv.get_shape()), reinterpret_cast*>(tv.data())}; +} + +template +struct is_vector_type : std::false_type +{ +}; + +template <> +struct is_vector_type : std::true_type +{ +}; + +template +struct is_vector_tensor_view : and_{}...> +{ +}; + +template +bool is_vectorizable(const Xs&... xs) +{ + return all_of({xs...}, [](const auto& s) { + if(s.standard() and (s.lens().back() % N) == 0) + return true; + if(s.broadcasted()) + { + auto n = std::inner_product(s.lens().begin(), + s.lens().end(), + s.strides().begin(), + 0, + std::plus<>{}, + [&](auto len, auto stride) -> std::size_t { + if(stride > 0 and len == 1) + return 0; + return stride; + }); + if(n == 1) + { + auto i = find_packed_len(s); + assert(i != -1); + return (s.lens()[i] % N) == 0; + } + } + return false; + }); +} + +template {})> +auto auto_vectorize(const shape& base_shape, Ts... xs) +{ + return [=](auto f) { + if(is_vectorizable<32>(base_shape, xs.get_shape()...)) + f(vectorize<32>(base_shape), vectorize<32>(xs)...); + else if(is_vectorizable<8>(base_shape, xs.get_shape()...)) + f(vectorize<8>(base_shape), vectorize<8>(xs)...); + else + f(base_shape, xs...); + }; +} + +template {})> +auto auto_vectorize(const shape& base_shape, Ts... xs) +{ + return [=](auto f) { f(base_shape, xs...); }; +} + +template +bool is_standard_offset(const X& x, const Xs&... xs) +{ + if(all_of({x, xs...}, [](const auto& s) { return s.standard(); })) + return true; + if(all_of({x, xs...}, [](const auto& s) { return s.packed(); }) and + all_of({xs...}, [&](const auto& s) { return s == x; })) + return true; + return false; +} + +template +auto pointwise_apply(Ts... 
ts) +{ + return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable { + if(is_standard_offset(ts.get_shape()...)) + { + ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable { + for(auto i = start; i < end; i++) + { + vec_apply(f, ts.data()[i]...); + } + }); + } + else + { + assert(base_shape.lens().size() <= 6); + ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable { + multi_index mi(base_shape, start); + for(auto i = start; i < end; i++) + { + vec_apply(f, ts.data()[mi.offset(ts.get_shape())]...); + ++mi; + } + }); + } + }; +} + +template +auto pointwise(Ts... ts) +{ + return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable { + auto_vectorize(base_shape, ts...)( + [&](auto bs, auto... xs) { pointwise_apply(xs...)(ctx, bs, min_grain, f); }); + }; +} + +template +struct cpu_unary : reduce_dims_base, auto_register_op> +{ + Op op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + std::string name() const { return "cpu::" + op.name(); } + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(2); + const auto& s = inputs.at(0); + return {s.type(), s.lens()}; + } + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const + { + argument result = get_arg(args, args.size() - 1); + + visit_all(result, get_arg(args, 0))([&](auto output, auto input) { + auto op2 = op; + pointwise(output, input)( + ctx, output.get_shape(), 1024, [op2](auto& y, auto x) { y = op2.apply()(x); }); + }); + + return result.reshape(output_shape); + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +template +struct cpu_binary : reduce_dims_base, auto_register_op> +{ + Op op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + std::string name() const { return "cpu::" + op.name(); } + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(3); + const auto& s = inputs.at(0); + return {s.type(), s.lens()}; + } + + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const + { + argument result = get_arg(args, args.size() - 1); + + visit_all(result, get_arg(args, 0), get_arg(args, 1))( + [&](auto output, auto input1, auto input2) { + auto op2 = op; + pointwise(output, input1, input2)( + ctx, output.get_shape(), 1024, [op2](auto& z, auto x, auto y) { + z = op2.apply()(x, y); + }); + }); + + return result.reshape(output_shape); + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/target.hpp b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/target.hpp new file mode 100644 index 000000000..589b680fe --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/target.hpp @@ -0,0 +1,51 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_CPU_TARGET_HPP +#define MIGRAPHX_GUARD_MIGRAPHLIB_CPU_TARGET_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +struct pass; +namespace cpu { + +struct MIGRAPHX_CPU_EXPORT target +{ + std::string name() const; + std::vector get_passes(migraphx::context& gctx, const compile_options&) const; + migraphx::context get_context() const { return context{}; } + argument copy_to(const argument& arg) const { return arg; } + argument copy_from(const argument& arg) const { return arg; } + argument allocate(const shape& s) const; +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/write_literals.hpp b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/write_literals.hpp new file mode 100644 index 000000000..3c23fb14f --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/include/migraphx/cpu/write_literals.hpp @@ -0,0 +1,45 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_WRITE_LITERALS_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_WRITE_LITERALS_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +struct module; +namespace cpu { + +struct write_literals +{ + std::string name() const { return "cpu::write_literals"; } + void apply(module& m) const; +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/cpu/layernorm.cpp b/docker/rocm/migraphx/targets/cpu/layernorm.cpp new file mode 100644 index 000000000..0d19eb827 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/layernorm.cpp @@ -0,0 +1,65 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct dnnl_layernorm : dnnl_op +{ + float epsilon = 1e-12f; + template + static auto reflect(Self& self, F f) + { + return pack(f(self.epsilon, "epsilon")); + } + + std::string name() const { return "dnnl::layernorm"; } + + shape compute_shape(std::vector inputs) const + { + // Compensate for allocation + inputs.pop_back(); + check_shapes{this->trim_post_op_inputs(inputs), *this}.has(1); + auto s = inputs.at(0); + // Call to get_primitive to make sure an algo is available + this->get_primitive(this->to_memory_desc(s, inputs)); + return s; + } + + dnnl::layer_normalization_forward::desc + get_desc(const std::unordered_map& m) const + { + return {dnnl::prop_kind::forward_inference, + m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)), + 1e-12f, + dnnl::normalization_flags::none}; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/logsoftmax.cpp b/docker/rocm/migraphx/targets/cpu/logsoftmax.cpp new file mode 100644 index 000000000..e4bb88dc8 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/logsoftmax.cpp @@ -0,0 +1,44 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct dnnl_logsoftmax : dnnl_extend_op +{ + dnnl::logsoftmax_forward::desc + get_desc(const std::unordered_map& m) const + { + int axis = this->op.axis; + return {dnnl::prop_kind::forward_inference, m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)), axis}; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/lowering.cpp b/docker/rocm/migraphx/targets/cpu/lowering.cpp new file mode 100644 index 000000000..a68eae820 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/lowering.cpp @@ -0,0 +1,502 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +template +T zero(const T&) +{ + return T(0); +} + +template +typename std::conditional_t{}, std::make_signed, std::enable_if>:: + type + make_signed(T x) +{ + return x; +} + +struct cpu_im2col +{ + op::im2col op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + static std::string name() { return "cpu::im2col"; } + shape compute_shape(const std::vector& inputs) const + { + return op.normalize_compute_shape(inputs); + } + + argument compute(context&, const shape& output_shape, std::vector args) const + { + argument result{output_shape}; + auto input_shape = args[0].get_shape(); + auto weights_shape = args[1].get_shape(); + visit_all(result, args[0])([&](auto col, auto input) { + const std::size_t& height = input_shape.lens()[2]; + const std::size_t& width = input_shape.lens()[3]; + const std::size_t& channels = weights_shape.lens()[1]; + const std::size_t& kernel_h = weights_shape.lens()[2]; + const std::size_t& kernel_w = weights_shape.lens()[3]; + const std::size_t& pad_h = op.padding[0]; + const std::size_t& pad_w = op.padding[1]; + const std::size_t& stride_h = op.stride[0]; + const std::size_t& stride_w = op.stride[1]; + + long kdiv2_h = long(kernel_h) / 2; + long kdiv2_w = long(kernel_w) / 2; + // calculate output sizes + const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1; + const std::size_t col_width = (width - kernel_w + 2 * pad_w) / stride_w + 1; + // account for padding for the starting position of the input pixels + long iinput = kdiv2_h - long(pad_h); + // loop over output pixels (ioutput, joutput) + for(std::size_t ioutput = 0; ioutput < col_height; ioutput++, iinput += stride_h) + { + long jinput = kdiv2_w - long(pad_w); + for(std::size_t joutput = 0; joutput < col_width; joutput++, jinput += stride_w) + { + // compute linear index for output + std::size_t ldx = ioutput * col_width + joutput; + std::size_t p = 0; + dfor(channels, + kernel_h, + kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) { + auto idx = iinput + long(koffset) - kdiv2_h; + auto jdx = jinput + long(loffset) - kdiv2_w; + col(ldx, p) = + ((idx >= 0) and (idx < height) and (jdx >= 0) and (jdx < width)) + ? 
input(0, c, idx, jdx) + : 0; + p++; + }); + } + } + }); + return result; + } +}; +MIGRAPHX_REGISTER_OP(cpu_im2col) + +struct cpu_op +{ + operation op = op::identity{}; + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + std::string name() const { return "cpu::op"; } + shape compute_shape(const std::vector& inputs) const { return op.compute_shape(inputs); } + argument compute(context&, const shape& output_shape, const std::vector& args) const + { + return op.compute(output_shape, args); + } + value to_value() const + { + value v; + v["name"] = op.name(); + v["operator"] = op.to_value(); + return v; + } + void from_value(const value& v) + { + op = make_op(v.at("name").to(), v.at("operator")); + } + friend std::ostream& operator<<(std::ostream& os, const cpu_op& x) + { + os << "cpu::" << x.op; + return os; + } +}; +MIGRAPHX_REGISTER_OP(cpu_op) + +struct cpu_pad +{ + op::pad op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "cpu::pad"; } + shape compute_shape(const std::vector& inputs) const { return op.compute_shape(inputs); } + argument compute(context&, const shape& output_shape, std::vector args) const + { + assert(output_shape.standard()); + argument result{output_shape}; + result.visit([&](auto output) { + using type = typename decltype(output)::value_type; + std::fill(output.begin(), output.end(), pad_clamp(op.value)); + }); + + visit_all(result, args[0])([&](auto output, auto input) { + shape_for_each(input.get_shape(), [&](const auto& idx) { + std::vector new_idx(idx.size()); + std::transform( + idx.begin(), idx.end(), op.pads.begin(), new_idx.begin(), [](auto i, auto j) { + return i + j; + }); + output(new_idx.begin(), new_idx.end()) = input(idx.begin(), idx.end()); + }); + }); + + return result; + } +}; +MIGRAPHX_REGISTER_OP(cpu_pad) + +struct cpu_rnn_var_sl_last_output +{ + op::rnn_var_sl_last_output op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "cpu::rnn_var_sl_last_output"; } + + shape compute_shape(std::vector inputs) const + { + return op.compute_shape(std::move(inputs)); + } + + argument compute(const shape& output_shape, std::vector args) const + { + argument result{output_shape}; + auto out_comp_lens = args[0].get_shape().lens(); + out_comp_lens[0] = 1; + shape out_comp_s{output_shape.type(), out_comp_lens}; + + visit_all(result, args[0])([&](auto output, auto input) { + args[1].visit([&](auto seq_lens) { + par_for(output_shape.elements(), [&](auto i) { + auto idx = out_comp_s.multi(i); + auto b = idx[2]; + if(op.direction == op::rnn_direction::reverse or idx[1] == 1) + { + idx[0] = 0; + } + else + { + idx[0] = seq_lens[b] - 1; + } + output[i] = input(idx.begin(), idx.end()); + }); + }); + }); + + return result; + } +}; +MIGRAPHX_REGISTER_OP(cpu_rnn_var_sl_last_output) + +struct cpu_apply +{ + module* modl; + std::unordered_map> apply_map{}; + instruction_ref last{}; + + void extend_op(const std::string& op_name, const std::string& cpu_name, bool allocate = true) + { + apply_map.emplace(op_name, [=](instruction_ref ins) { + auto&& op = ins->get_operator(); + if(allocate) + return replace(ins, make_op(cpu_name, op.to_value())); + return modl->replace_instruction(ins, make_op(cpu_name, op.to_value()), ins->inputs()); + }); + } + + void extend_dnnl_algos(const std::string& dnnl_name, + const std::vector>& algos) + { + for(auto&& pp : algos) + { + 
std::string op_name = pp.first; + std::string algo = pp.second; + apply_map.emplace(op_name, [=](instruction_ref ins) { + auto v = ins->get_operator().to_value(); + if(not v.is_object()) + return ins; + v["algo"] = algo; + auto op = make_op(dnnl_name, v); + return replace(ins, op); + }); + } + } + + template + auto fuse_match(M matcher, const operation& op, const std::vector& bind_inputs) + { + return match::make_match_finder(matcher, [=](auto&, const auto& r) { + auto ins = r.result; + std::vector inputs; + std::transform(bind_inputs.begin(), + bind_inputs.end(), + std::back_inserter(inputs), + [&](const auto& s) { return r.instructions[s]; }); + inputs.push_back(this->insert_allocation(ins, ins->get_shape())); + modl->replace_instruction(ins, op, inputs); + }); + } + + void init() + { + extend_dnnl_algos("dnnl::binary", + { + {"add", "binary_add"}, + {"div", "binary_div"}, + {"max", "binary_max"}, + {"min", "binary_min"}, + {"mul", "binary_mul"}, + }); + + extend_dnnl_algos("dnnl::eltwise", + { + {"abs", "eltwise_abs"}, + {"elu", "eltwise_elu"}, + {"exp", "eltwise_exp"}, + {"log", "eltwise_log"}, + {"relu", "eltwise_relu"}, + {"sqrt", "eltwise_sqrt"}, + {"tanh", "eltwise_tanh"}, + }); + + extend_dnnl_algos("dnnl::reduction", + { + {"reduce_max", "reduction_max"}, + {"reduce_mean", "reduction_mean"}, + {"reduce_min", "reduction_min"}, + {"reduce_sum", "reduction_sum"}, + }); + extend_op("concat", "dnnl::concat"); + extend_op("contiguous", "dnnl::reorder"); + extend_op("convolution", "dnnl::convolution"); +#ifndef MIGRAPHX_ENABLE_ZENDNN + extend_op("convolution_backwards", "dnnl::convolution_backwards"); + extend_op("dot", "dnnl::dot"); +#endif + extend_op("erf", "cpu::erf"); + extend_op("gather", "cpu::gather"); + extend_op("logsoftmax", "dnnl::logsoftmax"); + extend_op("lrn", "dnnl::lrn"); + extend_op("softmax", "dnnl::softmax"); + + extend_op("im2col", "cpu::im2col", false); + extend_op("leaky_relu", "cpu::leaky_relu", false); + extend_op("pad", "cpu::pad", false); + extend_op("rnn_var_sl_last_output", "cpu::rnn_var_sl_last_output", false); + } + + void apply() + { + init(); + // Apply fusion matchers first + match::find_matches(*modl, + fuse_match(match::gelu_erf(), + make_op("dnnl::eltwise", {{"algo", "eltwise_gelu_erf"}}), + {"x"}), + fuse_match(match::gelu_tanh(), + make_op("dnnl::eltwise", {{"algo", "eltwise_gelu_tanh"}}), + {"x"}), + fuse_match(match::layernorm(), make_op("dnnl::layernorm"), {"x"})); + // Apply these operators first so the inputs can be const folded + for(auto it : iterator_for(*modl)) + { + // skip lowering if input has fp8 as one of the inputs since oneDNN doesn't have fp8 + // supported yet. + if(std::any_of(it->inputs().begin(), it->inputs().end(), [](const auto& i) { + return contains(fp8_types{}.get(), i->get_shape().type()); + })) + continue; + if(it->name() == "pow") + { + apply_pow(it); + } + } + for(auto it : iterator_for(*modl)) + { + // skip lowering if input has fp8 as one of the inputs since oneDNN doesn't have fp8 + // supported yet. 
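+            // Added editorial note, not part of the original source: a minimal illustration of
+            // the guard below, assuming fp8_types{}.get() returns the list of fp8 shape type
+            // enums. An instruction such as add(fp8, fp8) is skipped here and keeps its
+            // reference implementation, while add(float, float) falls through and is lowered
+            // to the oneDNN/DNNL operators registered in apply_map.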
+ if(std::any_of(it->inputs().begin(), it->inputs().end(), [](const auto& i) { + return contains(fp8_types{}.get(), i->get_shape().type()); + })) + continue; + if(it->name() == "pooling") + { + apply_pooling(it); + } + else if(it->name() == "reshape") + { + apply_reshape(it); + } + else if(apply_map.count(it->name()) > 0) + { + apply_map.at(it->name())(it); + } + } + } + + instruction_ref apply_pow(instruction_ref ins) const + { + auto beta = read_scalar(ins->inputs()[1]); + if(beta.empty()) + return ins; + return replace(ins, + make_op("dnnl::eltwise", + {{"algo", "eltwise_pow"}, {"alpha", 1.0}, {"beta", beta.front()}}), + {ins->inputs().front()}); + } + + // TODO: update lowering to run the reference + // code when OneDNN can't execute pooling for a CPU + + // OneDNN has a limitation on padding size for pooling. see + // https://oneapi-src.github.io/oneDNN/dev_guide_convolution.html#doxid-dev-guide-convolution + + // padding = {2}; stride = {1}; lengths = {3} succeeds in oneDNN but + // padding = {2}; stride = {1}; lengths = {2} fails. + // Also, the referenced documentation contains a max. dimension size of 14 for the kernel + // ("weights tensor") that MIGraphX doesn't enforce. + instruction_ref apply_pooling(instruction_ref ins) const + { + auto&& op = ins->get_operator(); + auto v = op.to_value(); + if(has_op("dnnl::pooling") and ins->get_shape().type() == shape::type_t::float_type and + not v["ceil_mode"].to() and + v["mode"].to() != op::pooling_mode::lpnorm) + return replace(ins, make_op("dnnl::pooling", op.to_value())); + return ins; + } + /* + Lowers reshape copy operator to reshape lazy by inserting contiguous operators around it. + Contiguous ops will later by removed by eliminate_contiguous pass. + */ + instruction_ref apply_reshape(instruction_ref ins) const + { + std::vector before_contiguous_args = ins->inputs(); + auto before_alloc = + insert_allocation(ins, before_contiguous_args.front()->get_shape().as_standard()); + before_contiguous_args.push_back(before_alloc); + auto before_contig = + modl->insert_instruction(ins, make_op("dnnl::reorder"), {before_contiguous_args}); + + auto new_lazy_reshape = modl->insert_instruction( + ins, + make_op("reshape_lazy", {{"dims", {ins->get_operator().to_value().at("dims")}}}), + before_contig); + + std::vector after_contiguous_args = {new_lazy_reshape}; + auto after_alloc = insert_allocation(new_lazy_reshape, new_lazy_reshape->get_shape()); + after_contiguous_args.push_back(after_alloc); + return modl->replace_instruction(ins, make_op("dnnl::reorder"), after_contiguous_args); + } + + template + static std::vector read_scalar(instruction_ref ins) + { + if(ins->name() == "contiguous") + return read_scalar(ins->inputs().front()); + if(ins->get_shape().elements() != 1 and not ins->get_shape().scalar()) + return {}; + auto r = ins->eval(); + if(r.empty()) + return {}; + return {r.at()}; + } + + instruction_ref replace(instruction_ref ins, const operation& op) const + { + return replace(ins, op, ins->inputs()); + } + + instruction_ref + replace(instruction_ref ins, const operation& op, std::vector inputs) const + { + inputs.push_back(insert_allocation(ins, ins->get_shape())); + return modl->replace_instruction(ins, op, inputs); + } + + instruction_ref insert_allocation(instruction_ref ins, const shape& s) const + { + return modl->insert_instruction(ins, make_op("allocate", {{"shape", to_value(s)}})); + } +}; + +void lowering::apply(module& m) const { cpu_apply{&m}.apply(); } + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // 
namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/lrn.cpp b/docker/rocm/migraphx/targets/cpu/lrn.cpp new file mode 100644 index 000000000..bd4c27129 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/lrn.cpp @@ -0,0 +1,48 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct dnnl_lrn : dnnl_extend_op +{ + dnnl::lrn_forward::desc get_desc(const std::unordered_map& m) const + { + return {dnnl::prop_kind::forward_inference, + dnnl::algorithm::lrn_across_channels, + m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)), + this->op.size, + this->op.alpha, + this->op.beta, + this->op.bias}; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/mod.cpp b/docker/rocm/migraphx/targets/cpu/mod.cpp new file mode 100644 index 000000000..e28bdb19d --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/mod.cpp @@ -0,0 +1,36 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+#include
+#include
+#include
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace cpu {
+
+template struct cpu_binary;
+
+} // namespace cpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
diff --git a/docker/rocm/migraphx/targets/cpu/pooling.cpp b/docker/rocm/migraphx/targets/cpu/pooling.cpp
new file mode 100644
index 000000000..d10ed75a6
--- /dev/null
+++ b/docker/rocm/migraphx/targets/cpu/pooling.cpp
@@ -0,0 +1,83 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace cpu {
+
+struct dnnl_pooling : dnnl_extend_op
+{
+    std::vector arg_map(int) const { return {MIGRAPHX_DNNL_PREFIX(ARG_SRC)}; }
+
+    dnnl::algorithm get_algo() const
+    {
+        switch(op.mode)
+        {
+        case op::pooling_mode::max: return dnnl::algorithm::pooling_max;
+        case op::pooling_mode::average:
+            return op.count_include_pad ? dnnl::algorithm::pooling_avg_include_padding
+                                        : dnnl::algorithm::pooling_avg_exclude_padding;
+        case op::pooling_mode::lpnorm: MIGRAPHX_THROW("Lpnorm pooling mode not supported");
+        }
+        MIGRAPHX_THROW("Unknown pooling mode");
+    }
+
+    dnnl::pooling_v2_forward::desc
+    get_desc(const std::unordered_map& m) const
+    {
+        auto algo  = get_algo();
+        auto kdims = op.kdims();
+        std::vector padding_l(op.padding.begin(), op.padding.begin() + kdims);
+        std::vector padding_r(op.padding.begin() + kdims, op.padding.end());
+        // Note: It is not documented, but the default dilation seems to be 0 instead of 1.
+        // We need to offset dilations with -1.
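+        // Example (added editorial note, not in the original source; values are illustrative):
+        // MIGraphX dilations {1, 1} are passed to oneDNN as {0, 0}, and {2, 2} as {1, 1},
+        // since the transform below subtracts 1 from each dilation element.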
+ std::vector dilations; + std::transform(op.dilations.cbegin(), + op.dilations.cend(), + std::back_inserter(dilations), + [](size_t d) { return d - 1; }); + return {dnnl::prop_kind::forward_inference, + algo, + m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)), + to_dnnl_dims(op.stride), + to_dnnl_dims(op.lengths), + to_dnnl_dims(dilations), + to_dnnl_dims(padding_l), + to_dnnl_dims(padding_r)}; + } +}; + +} // namespace cpu + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/preallocate.cpp b/docker/rocm/migraphx/targets/cpu/preallocate.cpp new file mode 100644 index 000000000..d831a1942 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/preallocate.cpp @@ -0,0 +1,60 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct cpu_preallocate : auto_register_op +{ + shape s; + std::string id = ""; + argument data; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.s, "shape"), f(self.id, "id")); + } + + std::string name() const { return "cpu::preallocate"; } + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(0); + return s; + } + argument compute(context&, const shape&, const std::vector&) const { return data; } + void finalize(context&, const shape&, const std::vector&) { data = argument(s); } + lifetime get_lifetime() const { return lifetime::global; } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/reduction.cpp b/docker/rocm/migraphx/targets/cpu/reduction.cpp new file mode 100644 index 000000000..e0a7517ee --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/reduction.cpp @@ -0,0 +1,73 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct dnnl_reduction : dnnl_op +{ + std::string algo; + std::vector axes{}; + template + static auto reflect(Self& self, F f) + { + return pack_join(self.reflect_base(self, f), + pack(f(self.algo, "algo"), f(self.axes, "axes"))); + } + + std::string name() const { return "dnnl::reduction"; } + + shape compute_shape(std::vector inputs) const + { + // Compensate for allocation + inputs.pop_back(); + check_shapes{this->trim_post_op_inputs(inputs), *this}.has(1).standard(); + auto s = inputs.at(0); + auto lens = s.lens(); + for(auto axis : axes) + { + lens[axis] = 1; + } + auto r = shape{s.type(), lens}; + // Call to get_primitive to make sure an algo is available + this->get_primitive(this->to_memory_desc(r, inputs)); + return r; + } + + dnnl::reduction::desc get_desc(const std::unordered_map& m) const + { + return {to_dnnl_algo(algo), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)), + m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)), + 0, + 0}; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/reorder.cpp b/docker/rocm/migraphx/targets/cpu/reorder.cpp new file mode 100644 index 000000000..c549a6013 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/reorder.cpp @@ -0,0 +1,65 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct dnnl_reorder : dnnl_op +{ + std::string name() const { return "dnnl::reorder"; } + + shape adjust_shape(const shape& x, int, const shape&) const { return x; } + + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(2); + auto r = inputs.back(); + // Call to get_primitive to make sure an algo is available + this->get_primitive(this->to_memory_desc(r, inputs)); + return r; + } + // Custom desc class since its missing in dnnl + struct desc + { + dnnl::memory::desc src; + dnnl::memory::desc dst; + }; + desc get_desc(const std::unordered_map& m) const + { + return {m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)), m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST))}; + } + + auto get_primitive_desc(const desc& d, const dnnl::primitive_attr& attr) const + { + auto& engine = get_dnnl_context().engine; + return dnnl::reorder::primitive_desc(engine, d.src, engine, d.dst, attr); + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/softmax.cpp b/docker/rocm/migraphx/targets/cpu/softmax.cpp new file mode 100644 index 000000000..8c3610f23 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/softmax.cpp @@ -0,0 +1,43 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct dnnl_softmax : dnnl_extend_op +{ + dnnl::softmax_forward::desc get_desc(const std::unordered_map& m) const + { + int axis = this->op.axis; + return {dnnl::prop_kind::forward_inference, m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)), axis}; + } +}; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/sub.cpp b/docker/rocm/migraphx/targets/cpu/sub.cpp new file mode 100644 index 000000000..8f3436071 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/sub.cpp @@ -0,0 +1,36 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +template struct cpu_binary; + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/target.cpp b/docker/rocm/migraphx/targets/cpu/target.cpp new file mode 100644 index 000000000..e148aa5b6 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/target.cpp @@ -0,0 +1,122 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
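For reference, a plain-C++ sketch of what the softmax primitive wrapped by dnnl_softmax computes along its axis, specialized here to a 2-D row-wise case (axis == 1) for brevity. This is illustrative only and is not the DNNL code path.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable softmax over each row of a rows x cols matrix stored
// row-major. Subtracting the row maximum before exponentiating avoids overflow.
void softmax_rows(std::vector<float>& data, std::size_t rows, std::size_t cols)
{
    for(std::size_t r = 0; r < rows; ++r)
    {
        float* row = data.data() + r * cols;
        float m    = *std::max_element(row, row + cols);
        float sum  = 0.0f;
        for(std::size_t c = 0; c < cols; ++c)
        {
            row[c] = std::exp(row[c] - m);
            sum += row[c];
        }
        for(std::size_t c = 0; c < cols; ++c)
            row[c] /= sum;
    }
}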
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +std::string target::name() const { return "cpu"; } + +// cppcheck-suppress constParameterReference +std::vector target::get_passes(migraphx::context& gctx, const compile_options&) const +{ + auto& ctx = any_cast(gctx); + std::set unsupported_types(shape::types().begin(), shape::types().end()); + std::set unsupported_ops{ + "all", "scatternd_add", "scatternd_mul", "scatternd_none"}; + unsupported_types.erase(shape::type_t::float_type); + return {normalize_ops{}, + rewrite_quantization{}, + dead_code_elimination{}, + eliminate_data_type{unsupported_types, shape::type_t::float_type, unsupported_ops}, + dead_code_elimination{}, + simplify_reshapes{}, + eliminate_convert{}, + eliminate_identity{}, + eliminate_pad{}, + dead_code_elimination{}, + rewrite_rnn{}, + dead_code_elimination{}, + eliminate_common_subexpression{}, + dead_code_elimination{}, + simplify_algebra{}, + simplify_reshapes{}, + eliminate_convert{}, + dead_code_elimination{}, + simplify_reshapes{}, + eliminate_convert{}, + dead_code_elimination{}, + simplify_algebra{}, + simplify_reshapes{}, + eliminate_convert{}, + dead_code_elimination{}, + propagate_constant{}, + dead_code_elimination{}, + auto_contiguous{}, + lowering{}, + eliminate_contiguous{"dnnl::reorder"}, + dead_code_elimination{}, + replace_allocate{cpu_allocation_model{}}, + dead_code_elimination{}, + adjust_allocation{cpu_allocation_model{}}, + dead_code_elimination{}, + fuse_ops{&ctx}, + dead_code_elimination{}, + write_literals{}, + dead_code_elimination{}, + memory_coloring{"cpu::allocate"}, + dead_code_elimination{}, + preallocate_param{"scratch", cpu_allocation_model{}}, + dead_code_elimination{}}; +} + +argument target::allocate(const shape& s) const { return fill_argument(s, 0); } + +MIGRAPHX_REGISTER_TARGET(target); + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/cpu/write_literals.cpp b/docker/rocm/migraphx/targets/cpu/write_literals.cpp new file mode 100644 index 000000000..0899df4e8 --- /dev/null +++ b/docker/rocm/migraphx/targets/cpu/write_literals.cpp @@ -0,0 +1,70 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
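The pass list returned by cpu::target::get_passes above is what runs when a program is compiled for the "cpu" target. A rough usage sketch with MIGraphX's internal C++ API follows; header paths and exact signatures are assumptions here, since the include lines are elided in this copy of the diff.

#include <migraphx/program.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/register_target.hpp>

int main()
{
    migraphx::program p;
    auto* mm = p.get_main_module();
    migraphx::shape s{migraphx::shape::float_type, {2, 3}};
    auto x   = mm->add_parameter("x", s);
    auto y   = mm->add_parameter("y", s);
    auto sum = mm->add_instruction(migraphx::make_op("add"), x, y);
    mm->add_return({sum});

    // Compiling for "cpu" runs the pipeline defined in target::get_passes above.
    p.compile(migraphx::make_target("cpu"));

    migraphx::parameter_map params;
    params["x"] = migraphx::generate_argument(s);
    params["y"] = migraphx::generate_argument(s);
    auto result = p.eval(params).back();
    (void)result;
}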
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace cpu { + +struct cpu_literal +{ + argument data; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.data, "data")); + } + + std::string name() const { return "cpu::literal"; } + + shape compute_shape(const std::vector&) const { return data.get_shape(); } + + argument compute(const shape&, const std::vector&) const { return data; } + + friend std::ostream& operator<<(std::ostream& os, const cpu_literal& x) + { + os << x.name(); + return os; + } +}; +MIGRAPHX_REGISTER_OP(cpu_literal); + +void write_literals::apply(module& m) const +{ + for(auto ins : iterator_for(m)) + { + if(ins->name() != "@literal") + continue; + m.replace_instruction(ins, cpu_literal{ins->get_literal().get_argument()}); + } +} + +} // namespace cpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/fpga/CMakeLists.txt b/docker/rocm/migraphx/targets/fpga/CMakeLists.txt new file mode 100644 index 000000000..11b47b9b2 --- /dev/null +++ b/docker/rocm/migraphx/targets/fpga/CMakeLists.txt @@ -0,0 +1,43 @@ +##################################################################################### +# The MIT License (MIT) +# +# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+##################################################################################### + +add_library(migraphx_fpga + target.cpp + lowering.cpp + subgraph.cpp + vitis_ai_adapter.cpp +) + +set_target_properties(migraphx_fpga PROPERTIES EXPORT_NAME fpga) +rocm_set_soversion(migraphx_fpga ${MIGRAPHX_SO_VERSION}) + +rocm_clang_tidy_check(migraphx_fpga) +target_link_libraries(migraphx_fpga migraphx) + +rocm_install_targets( + PRIVATE + TARGETS migraphx_fpga + INCLUDE + ${CMAKE_CURRENT_SOURCE_DIR}/include +) diff --git a/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/context.hpp b/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/context.hpp new file mode 100644 index 000000000..2c8242a76 --- /dev/null +++ b/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/context.hpp @@ -0,0 +1,45 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef MIGRAPHX_GUARD_FPGA_CONTEXT_HPP +#define MIGRAPHX_GUARD_FPGA_CONTEXT_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace fpga { + +struct context +{ + int id = 0; + + void finish() const {} +}; + +} // namespace fpga +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif // MIGRAPHX_GUARD_FPGA_CONTEXT_HPP diff --git a/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/lowering.hpp b/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/lowering.hpp new file mode 100644 index 000000000..dc8a7bc6b --- /dev/null +++ b/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/lowering.hpp @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef MIGRAPHX_GUARD_FPGA_LOWERING_HPP +#define MIGRAPHX_GUARD_FPGA_LOWERING_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace fpga { + +struct lowering +{ + context* ctx = nullptr; + std::string name() const { return "fpga::lowering"; } + void apply(module& m) const; +}; + +} // namespace fpga +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif // MIGRAPHX_GUARD_FPGA_LOWERING_HPP diff --git a/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/subgraph.hpp b/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/subgraph.hpp new file mode 100644 index 000000000..62f68b09d --- /dev/null +++ b/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/subgraph.hpp @@ -0,0 +1,45 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef MIGRAPHX_GUARD_FPGA_SUBGRAPH_HPP +#define MIGRAPHX_GUARD_FPGA_SUBGRAPH_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace fpga { + +struct subgraph +{ + std::string name() const { return "fpga::subgraph"; } + void apply(module_pass_manager& mpm) const; +}; + +} // namespace fpga +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif // MIGRAPHX_GUARD_FPGA_SUBGRAPH_HPP diff --git a/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/target.hpp b/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/target.hpp new file mode 100644 index 000000000..dbcb0bcff --- /dev/null +++ b/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/target.hpp @@ -0,0 +1,55 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef MIGRAPHX_GUARD_FPGA_TARGET_HPP +#define MIGRAPHX_GUARD_FPGA_TARGET_HPP + +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +struct pass; +namespace fpga { + +struct target +{ + std::string name() const; + std::vector get_passes(migraphx::context& ctx, const compile_options&) const; + migraphx::context get_context() const { return context{}; } + supported_segments find_supported(const_module_ref mod, support_metric m) const; + argument copy_to(const argument& arg) const { return arg; } + argument copy_from(const argument& arg) const { return arg; } + argument allocate(const shape& s) const; +}; + +} // namespace fpga +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif // MIGRAPHX_GUARD_FPGA_TARGET_HPP diff --git a/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/vitis_ai_adapter.hpp b/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/vitis_ai_adapter.hpp new file mode 100644 index 000000000..64d2300c4 --- /dev/null +++ b/docker/rocm/migraphx/targets/fpga/include/migraphx/fpga/vitis_ai_adapter.hpp @@ -0,0 +1,52 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef MIGRAPHX_GUARD_FPGA_VITIS_AI_ADAPTER_HPP +#define MIGRAPHX_GUARD_FPGA_VITIS_AI_ADAPTER_HPP + +#include + +#include +#include + +namespace vitis_ai { + +class x_model +{ + migraphx::shape shape; + + public: + migraphx::shape get_shape() const; + void set_shape(migraphx::shape); +}; + +x_model create_xmodel(migraphx::const_module_ref mod); + +migraphx::argument execute(const x_model& xmodel, + const migraphx::shape& output_shape, + std::vector& args); + +} // namespace vitis_ai + +#endif // MIGRAPHX_GUARD_FPGA_VITIS_AI_ADAPTER_HPP diff --git a/docker/rocm/migraphx/targets/fpga/lowering.cpp b/docker/rocm/migraphx/targets/fpga/lowering.cpp new file mode 100644 index 000000000..ad17dc8d1 --- /dev/null +++ b/docker/rocm/migraphx/targets/fpga/lowering.cpp @@ -0,0 +1,91 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "migraphx/fpga/vitis_ai_adapter.hpp" + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +namespace fpga { + +struct fpga_vitis_op +{ + fpga_vitis_op() = default; + explicit fpga_vitis_op(vitis_ai::x_model model) : xmodel(std::move(model)){}; + + vitis_ai::x_model xmodel; + int dummy = 0; + + template + static auto reflect(Self& self, F f) + { + // return pack(f(self.xmodel, "xmodel")); + return pack(f(self.dummy, "dummy")); + } + + std::string name() const { return "fpga::vitis_ai"; } + + shape compute_shape(const std::vector& inputs) const + { + (void)inputs; + return xmodel.get_shape(); + } + + argument + compute(const context& ctx, const shape& output_shape, std::vector args) const + { + std::cout << "The context is " << ctx.id << std::endl; + return ::vitis_ai::execute(xmodel, output_shape, args); + } +}; +MIGRAPHX_REGISTER_OP(fpga_vitis_op) + +void lowering::apply(module& m) const +{ + auto* mod = &m; + + // test modifying the context from a pass + ctx->id = 2; + + for(auto it : iterator_for(*mod)) + { + if(it->name() == "fpga::vitis_placeholder") + { + assert(it->module_inputs().size() == 1); + auto xmodel = ::vitis_ai::create_xmodel(it->module_inputs()[0]); + mod->replace_instruction(it, fpga_vitis_op{xmodel}, it->inputs()); + } + } +} + +} // namespace fpga +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/fpga/subgraph.cpp b/docker/rocm/migraphx/targets/fpga/subgraph.cpp new file mode 100644 index 000000000..d0e09a5de --- /dev/null +++ b/docker/rocm/migraphx/targets/fpga/subgraph.cpp @@ -0,0 +1,133 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include + +#include +#include "migraphx/iterator.hpp" +#include +#include "migraphx/make_op.hpp" +#include "migraphx/module.hpp" +#include "migraphx/ranges.hpp" +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +namespace fpga { + +struct fpga_placeholder_op +{ + fpga_placeholder_op() = default; + + int dummy = 0; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.dummy, "dummy")); + } + + std::string name() const { return "fpga::vitis_placeholder"; } + + shape compute_shape(const std::vector& inputs, std::vector mods) const + { + (void)inputs; + if(mods.size() != 1) + { + MIGRAPHX_THROW("should have one submodule."); + } + module_ref sm = mods.front(); + if(sm->get_output_shapes().size() != 1) + MIGRAPHX_THROW("Only one return"); + return sm->get_output_shapes().front(); + } +}; +MIGRAPHX_REGISTER_OP(fpga_placeholder_op) + +bool is_fpga_instr(migraphx::instruction_ref it) +{ + // assuming all instructions that aren't @param, @literal, or input data are fpga instrs + if(migraphx::starts_with(it->name(), "@")) + { + return false; + } + // no inputs to the instr means it's input data + if(it->inputs().empty()) + { + return false; + } + return true; +} + +void subgraph::apply(module_pass_manager& mpm) const +{ + auto& mod = mpm.get_module(); + auto* pm = mpm.create_module(mod.name() + ":fpga"); + pm->set_bypass(); + + migraphx::instruction_ref first = mod.end(); + migraphx::instruction_ref last; + std::vector literal_inputs; + for(auto it : iterator_for(mod)) + { + // assuming we want all the params/literals as inputs to the FPGA submodule + if(migraphx::starts_with(it->name(), "@param") or + migraphx::starts_with(it->name(), "@literal")) + { + literal_inputs.push_back(it); + } + if(is_fpga_instr(it)) + { + if(first == mod.end()) + { + first = it; + } + last = it; + } + } + + // TODO(varunsh): this code may be replaceable by code in the fuse_pointwise pass + + // assuming all FPGA instructions are in one contiguous range + pm->insert_instructions(pm->end(), first, std::next(last), {}); + migraphx::instruction_ref placeholder_ins; + for(auto it : iterator_for(mod)) + { + if(migraphx::starts_with(it->name(), "@return")) + { + placeholder_ins = mod.insert_instruction( + it, migraphx::make_op("fpga::vitis_placeholder"), literal_inputs, {pm}); + break; + } + } + + mod.replace_return({placeholder_ins}); +} + +} // namespace fpga +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/fpga/target.cpp b/docker/rocm/migraphx/targets/fpga/target.cpp new file mode 100644 index 000000000..570779fff --- /dev/null +++ b/docker/rocm/migraphx/targets/fpga/target.cpp @@ -0,0 +1,83 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
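subgraph::apply above assumes that every FPGA-eligible instruction forms one contiguous run and simply remembers the first and last hit while scanning the module. A standalone sketch of that pattern on a plain container (illustrative only; the real code iterates instruction_refs):

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

// Find the [first, last] pair of elements matching a predicate, assuming all
// matches are contiguous -- the same assumption subgraph::apply makes.
template <class Range, class Pred>
auto contiguous_match_range(Range& r, Pred pred)
{
    auto first = std::find_if(r.begin(), r.end(), pred);
    auto last  = first;
    for(auto it = first; it != r.end(); ++it)
    {
        if(pred(*it))
            last = it;
    }
    return std::make_pair(first, last);
}

// Example: with {"@param", "mul", "add", "@return"} and a predicate that
// rejects names starting with '@', the returned pair brackets {"mul", "add"}.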
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace fpga { + +std::string target::name() const { return "fpga"; } + +std::vector target::get_passes(migraphx::context& gctx, const compile_options&) const +{ + // not sure if all these passes are needed but they were copied from ref/ + auto& ctx = any_cast(gctx); + return {normalize_ops{}, + eliminate_pad{}, + dead_code_elimination{}, + insert_pad{}, + dead_code_elimination{}, + rewrite_rnn{}, + dead_code_elimination{}, + auto_contiguous{}, + dead_code_elimination{}, + subgraph{}, + dead_code_elimination{}, + lowering{&ctx}, + dead_code_elimination{}}; +} + +argument target::allocate(const shape& s) const { return fill_argument(s, 0); } + +supported_segments target::find_supported(const_module_ref mod, support_metric m) const +{ + (void)m; + + supported_segment instrs; + for(const auto ins : iterator_for(*mod)) + { + instrs.instructions.insert(ins); + } + instrs.metric = 1; // arbitrary value + return {instrs}; +} + +MIGRAPHX_REGISTER_TARGET(target); + +} // namespace fpga +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/fpga/vitis_ai_adapter.cpp b/docker/rocm/migraphx/targets/fpga/vitis_ai_adapter.cpp new file mode 100644 index 000000000..fa4ecdc68 --- /dev/null +++ b/docker/rocm/migraphx/targets/fpga/vitis_ai_adapter.cpp @@ -0,0 +1,65 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "migraphx/fpga/vitis_ai_adapter.hpp" + +#include "migraphx/module.hpp" + +#include "migraphx/stringutils.hpp" +namespace vitis_ai { + +migraphx::shape x_model::get_shape() const { return shape; }; + +void x_model::set_shape(migraphx::shape s) { shape = s; } + +x_model create_xmodel(migraphx::const_module_ref mod) +{ + std::cout << "Calling an external function: create_xmodel!\n"; + x_model xmodel; + xmodel.set_shape(migraphx::shape(mod->get_output_shapes())); + return xmodel; +} + +migraphx::argument execute(const x_model& xmodel, + const migraphx::shape& output_shape, + std::vector& args) +{ + (void)xmodel; + + std::cout << "Calling an external function: execute!\n"; + + std::cout << "Output Shape: " << output_shape << std::endl; + std::cout << "Args: " << args.size() << std::endl; + for(const auto& arg : args) + { + std::cout << " " << arg.get_shape() << std::endl; + } + std::cout << std::endl; + + migraphx::argument result{output_shape}; + + return result; +} + +} // namespace vitis_ai diff --git a/docker/rocm/migraphx/targets/gpu/CMakeLists.txt b/docker/rocm/migraphx/targets/gpu/CMakeLists.txt new file mode 100644 index 000000000..82cc1fb0a --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/CMakeLists.txt @@ -0,0 +1,407 @@ +# #################################################################################### +# The MIT License (MIT) +# +# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# #################################################################################### + +find_package(hip REQUIRED) +if(NOT GPU_TARGETS) + set(fatal_msg "HIP package is broken and has no GPU_TARGETS. Please pass GPU_TARGETS to cmake.") + if(NOT WIN32) + set(fatal_msg "${fatal_msg}\nUse -DGPU_TARGETS=$(/opt/rocm/bin/rocminfo | grep -o -m1 'gfx.*') to build for your GPU.") + endif() + message(FATAL_ERROR ${fatal_msg}) +endif() + +if(MIGRAPHX_USE_MIOPEN) + find_package(miopen REQUIRED) + message(STATUS "MIGraphX is using MIOpen") +else() + message(STATUS "MIGraphX is not using MIOpen") +endif() + +if(MIGRAPHX_USE_ROCBLAS) + # rocblas + find_package(rocblas REQUIRED) + message(STATUS "MIGraphX build with rocBLAS") +else() + message(STATUS "MIGraphX build without rocBLAS") +endif() + +if(MIGRAPHX_USE_HIPBLASLT) + # hipblaslt + find_package(hipblaslt REQUIRED) + # Making hipblas required to workaround the broken hipblaslt package. 
+ find_package(hipblas REQUIRED) + message(STATUS "MIGraphx build with hipBLAS and hipBLASLt") +else() + message(STATUS "MIGraphX build without hipBLAS and hipBLASLt") +endif() + +if(MIGRAPHX_USE_COMPOSABLEKERNEL) + find_package(composable_kernel 1.0.0 REQUIRED COMPONENTS jit_library) +endif() + +if(BUILD_DEV) + set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "Use hipRTC APIs") +else() + set(MIGRAPHX_USE_HIPRTC ON CACHE BOOL "Use hipRTC APIs") +endif() + +file(GLOB KERNEL_FILES CONFIGURE_DEPENDS + ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp) + +if(NOT MIGRAPHX_USE_COMPOSABLEKERNEL) + list(REMOVE_ITEM KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck_gemm.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck.hpp) +endif() + +include(Embed) +add_embed_library(migraphx_kernels ${KERNEL_FILES} RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/) + +configure_file(device/targets.hpp.in include/migraphx/gpu/device/targets.hpp) +file(GLOB DEVICE_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/device/*.cpp) +add_library(migraphx_device ${DEVICE_GPU_SRCS}) + +add_library(compile_for_gpu INTERFACE) +target_compile_features(compile_for_gpu INTERFACE cxx_std_17) +target_compile_options(compile_for_gpu INTERFACE -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fnative-half-arguments-and-returns) +target_link_options(compile_for_gpu INTERFACE -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-option-ignored) +target_link_libraries(compile_for_gpu INTERFACE hip::device) +check_cxx_compiler_flag("--cuda-host-only -fhip-lambda-host-device -x hip" HAS_HIP_LAMBDA_HOST_DEVICE) + +if(HAS_HIP_LAMBDA_HOST_DEVICE) + message(STATUS "Enable -fhip-lambda-host-device") + target_compile_options(compile_for_gpu INTERFACE -fhip-lambda-host-device) +endif() + +set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device) +rocm_set_soversion(migraphx_device ${MIGRAPHX_SO_VERSION}) +rocm_clang_tidy_check(migraphx_device) +target_link_libraries(migraphx_device PUBLIC migraphx) +target_link_libraries(migraphx_device PRIVATE compile_for_gpu) +if(NOT MIGRAPHX_USE_MIOPEN AND NOT MIGRAPHX_USE_ROCBLAS) + target_link_libraries(migraphx_device INTERFACE hip::host) +endif() +target_include_directories(migraphx_device PUBLIC $) +target_include_directories(migraphx_device PRIVATE $) +target_include_directories(migraphx_device PRIVATE $) +target_compile_options(migraphx_device PRIVATE -Wno-ignored-attributes) +migraphx_generate_export_header(migraphx_device DIRECTORY migraphx/gpu/device) + +add_library(kernel_file_check EXCLUDE_FROM_ALL) + +foreach(KERNEL_FILE ${KERNEL_FILES}) + get_filename_component(KERNEL_BASE_FILE ${KERNEL_FILE} NAME_WE) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp "#include \n") + target_sources(kernel_file_check PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp) +endforeach() + +target_compile_definitions(kernel_file_check PRIVATE -DMIGRAPHX_NLOCAL=256) +target_compile_definitions(kernel_file_check PRIVATE -DMIGRAPHX_WAVEFRONTSIZE=64) +target_include_directories(kernel_file_check PRIVATE $) +target_link_libraries(kernel_file_check compile_for_gpu) +if(MIGRAPHX_USE_COMPOSABLEKERNEL) + target_link_libraries(kernel_file_check composable_kernel::jit_library) +endif() + 
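# ---------------------------------------------------------------------------
# Illustrative configure invocation (not part of this file): the cache options
# checked above can be set on the command line, and GPU_TARGETS must be
# supplied when the HIP package cannot detect it, e.g.
#
#   cmake -DGPU_TARGETS=gfx90a \
#         -DMIGRAPHX_USE_MIOPEN=On -DMIGRAPHX_USE_ROCBLAS=On \
#         -DMIGRAPHX_USE_HIPBLASLT=Off -DMIGRAPHX_USE_COMPOSABLEKERNEL=Off ..
#
# The option names come from the if() checks in this file; the gfx target and
# On/Off values are examples only, and the defaults are defined elsewhere in
# the build.
# ---------------------------------------------------------------------------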
+rocm_clang_tidy_check(kernel_file_check) + +file(GLOB JIT_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp) + +if(NOT MIGRAPHX_USE_COMPOSABLEKERNEL) + list(REMOVE_ITEM JIT_GPU_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm_softmax_gemm.cpp) +endif() + +if(MIGRAPHX_USE_MIOPEN) + set(MIOPEN_SRCS abs.cpp) +endif() + +add_library(migraphx_gpu + analyze_streams.cpp + allocation_model.cpp + argmax.cpp + argmin.cpp + code_object_op.cpp + compile_ops.cpp + compile_gen.cpp + compile_hip.cpp + compile_hip_code_object.cpp + compile_hipblaslt.cpp + compile_miopen.cpp + compile_pointwise.cpp + compiler.cpp + device_name.cpp + fuse_ck.cpp + fuse_mlir.cpp + fuse_ops.cpp + gemm_impl.cpp + hip.cpp + hipblaslt.cpp + hip_gemm_impl.cpp + kernel.cpp + lowering.cpp + logsoftmax.cpp + loop.cpp + lrn.cpp + mlir.cpp + multinomial.cpp + no_device.cpp + nonzero.cpp + pack_args.cpp + prefuse_ops.cpp + prepare_reduce.cpp + perfdb.cpp + pooling.cpp + problem_cache.cpp + reverse.cpp + rnn_variable_seq_lens.cpp + rocblas.cpp + schedule_model.cpp + sync_device.cpp + target.cpp + time_op.cpp + topk.cpp + write_literals.cpp + ${JIT_GPU_SRCS} + ${MIOPEN_SRCS} +) + +set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu) +migraphx_generate_export_header(migraphx_gpu) + +function(register_migraphx_gpu_ops PREFIX) + foreach(OP ${ARGN}) + register_op(migraphx_gpu HEADER migraphx/gpu/${OP}.hpp OPERATORS gpu::${PREFIX}${OP} INCLUDES migraphx/gpu/context.hpp) + endforeach() +endfunction() + +register_migraphx_gpu_ops(hip_ + argmax + argmin + logsoftmax + loop + multinomial + nonzero + prefix_scan_sum + reverse + topk +) +if (MIGRAPHX_USE_MIOPEN) +register_migraphx_gpu_ops(miopen_ + abs + contiguous + lrn + pooling +) +else() +register_migraphx_gpu_ops(miopen_ + contiguous +) +endif() +register_op(migraphx_gpu + HEADER migraphx/gpu/rnn_variable_seq_lens.hpp + OPERATORS gpu::hip_rnn_var_sl_shift_sequence gpu::hip_rnn_var_sl_shift_output gpu::hip_rnn_var_sl_last_output + INCLUDES migraphx/gpu/context.hpp) +if(MIGRAPHX_USE_ROCBLAS) + register_op(migraphx_gpu + HEADER migraphx/gpu/gemm.hpp + OPERATORS gpu::rocblas_gemm gpu::rocblas_gemm + INCLUDES migraphx/gpu/context.hpp) +endif() +if(MIGRAPHX_USE_HIPBLASLT) + register_op(migraphx_gpu + HEADER migraphx/gpu/hip_gemm.hpp + OPERATORS gpu::hip_gemm gpu::hip_gemm + INCLUDES migraphx/gpu/context.hpp) +endif() +if (MIGRAPHX_USE_MIOPEN) + register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp + OPERATORS gpu::miopen_convolution gpu::miopen_convolution gpu::miopen_convolution + INCLUDES migraphx/gpu/context.hpp) +endif() +rocm_set_soversion(migraphx_gpu ${MIGRAPHX_SO_VERSION}) +rocm_clang_tidy_check(migraphx_gpu) + +set(MIGRAPHX_ENABLE_MLIR ON CACHE BOOL "") + +if(MIGRAPHX_ENABLE_MLIR) + # Find package rocMLIR + find_package(rocMLIR 1.0.0 CONFIG REQUIRED) + message(STATUS "Build with rocMLIR::rockCompiler ${rocMLIR_VERSION}") + target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_MLIR") + # Make this private to avoid multiple inclusions of LLVM symbols. + # TODO: Fix rocMLIR's library to hide LLVM internals. 
+ target_link_libraries(migraphx_gpu PRIVATE rocMLIR::rockCompiler) +endif() + +if(MIGRAPHX_USE_HIPRTC) + find_package(hiprtc REQUIRED) + message(STATUS "MIGraphX is using hipRTC") + target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_USE_HIPRTC=1) + target_link_libraries(migraphx_gpu PUBLIC hiprtc::hiprtc) +else() + message(STATUS "MIGraphX is using HIP Clang") + + # Get flags needed to compile hip + include(TargetFlags) + target_flags(HIP_COMPILER_FLAGS hip::device) + + # Remove cuda arch flags + string(REGEX REPLACE "--cuda-gpu-arch=[a-z0-9]+ ?" "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") + string(REGEX REPLACE "--offload-arch=[a-z0-9:+-]+ ?" "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") + + # Skip library paths since hip will incorrectly treat it as a source file + string(APPEND HIP_COMPILER_FLAGS " ") + + if(WIN32) + string(REPLACE "\\" "/" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") + endif() + foreach(_unused RANGE 2) + string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") + endforeach() + + message(STATUS "Hip compiler flags: \"${HIP_COMPILER_FLAGS}\"") + target_compile_definitions(migraphx_gpu PRIVATE + -DMIGRAPHX_HIP_COMPILER="${CMAKE_CXX_COMPILER}" + -DMIGRAPHX_HIP_COMPILER_FLAGS="${HIP_COMPILER_FLAGS}" + ) + + if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER) + if(WIN32) + execute_process(COMMAND where ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER) + else() + execute_process(COMMAND which ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER) + endif() + string(STRIP "${MIGRAPHX_HIP_COMPILER_LAUNCHER}" MIGRAPHX_HIP_COMPILER_LAUNCHER) + target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_HIP_COMPILER_LAUNCHER="${MIGRAPHX_HIP_COMPILER_LAUNCHER}") + endif() +endif() + +target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_CXX_COMPILER="${CMAKE_CXX_COMPILER}") + +# Check miopen find mode api + +include(CheckLibraryExists) +if (MIGRAPHX_USE_MIOPEN) + get_target_property(MIOPEN_LOCATION MIOpen LOCATION) + target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_USE_MIOPEN=1) + check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API) + check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API) +else() +target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_USE_MIOPEN=0) +endif() + +if(MIGRAPHX_USE_ROCBLAS) + get_target_property(ROCBLAS_LOCATION roc::rocblas LOCATION) + target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_USE_ROCBLAS=1) + # Beta API for automated GEMM tuning + check_library_exists(roc::rocblas "rocblas_gemm_ex_get_solutions" "${ROCBLAS_LOCATION}" HAS_ROCBLAS_TUNING_BETA_FEATURE_API) + # rocblas FP8 API + check_library_exists(roc::rocblas "rocblas_gemm_strided_batched_ex3" "${ROCBLAS_LOCATION}" HAS_ROCBLAS_FP8_BETA_API) +else() + target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_USE_ROCBLAS=0) +endif() + +if(MIGRAPHX_USE_HIPBLASLT) + target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_USE_HIPBLASLT=1) +else() + target_compile_definitions(migraphx_gpu PUBLIC MIGRAPHX_USE_HIPBLASLT=0) +endif() + +if(MIGRAPHX_USE_MIOPEN) + set(MIGRAPHX_USE_FIND_2_API "${HAS_FIND_2_API}" CACHE BOOL "") + + if(MIGRAPHX_USE_FIND_2_API) + check_library_exists(MIOpen "miopenSetFindOptionPreallocatedTensor" "${MIOPEN_LOCATION}" HAS_PREALLOCATION_API) + if(HAS_PREALLOCATION_API) + target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API -DMIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS) + else() + 
target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API) + endif() + message(STATUS "MIGraphx is using Find-2.0 API of MIOpen") + else() + message(STATUS "MIGraphx is using legacy Find API in MIOpen") + endif() + + if(HAS_FIND_MODE_API) + target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_MODE_API) + message(STATUS "MIGraphx is using Find Mode API of MIOpen") + else() + message(STATUS "MIOpen does not have find mode api") + endif() + + target_link_libraries(migraphx_gpu PUBLIC MIOpen) +endif() + +if(MIGRAPHX_USE_ROCBLAS) + if(HAS_ROCBLAS_TUNING_BETA_FEATURE_API) + target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_USE_ROCBLAS_TUNING_API -DROCBLAS_BETA_FEATURES_API -DROCBLAS_NO_DEPRECATED_WARNINGS) + message(STATUS "MIGraphx is using Beta API of rocBLAS") + else() + message(STATUS "rocBLAS does not have User Tuning Beta API") + endif() + + if(HAS_ROCBLAS_FP8_BETA_API) + target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_USE_ROCBLAS_FP8_API -DROCBLAS_BETA_FEATURES_API -DROCBLAS_NO_DEPRECATED_WARNINGS) + message(STATUS "MIGraphX is using Beta API of rocBLAS for FP8 computations") + else() + message(STATUS "rocBLAS does not have Fp8 Beta API") + endif() + + + target_link_libraries(migraphx_gpu PUBLIC roc::rocblas) +endif() + +if(MIGRAPHX_USE_HIPBLASLT) + target_link_libraries(migraphx_gpu PUBLIC roc::hipblaslt) +endif() + +if(WIN32) + # Temporary workaround on rocMLIR not exporting correctly libraries it depends on. + target_link_libraries(migraphx_gpu PRIVATE ntdll) +endif() + +target_link_libraries(migraphx_gpu PUBLIC migraphx) +if(NOT MIGRAPHX_USE_MIOPEN AND NOT MIGRAPHX_USE_ROCBLAS) + target_link_libraries(migraphx_gpu PUBLIC migraphx_device) +else() + target_link_libraries(migraphx_gpu PRIVATE migraphx_device) +endif() +target_link_libraries(migraphx_gpu PRIVATE migraphx_kernels) +if(MIGRAPHX_USE_COMPOSABLEKERNEL) + target_link_libraries(migraphx_gpu PRIVATE composable_kernel::jit_library) + target_compile_definitions(migraphx_gpu PRIVATE MIGRAPHX_USE_COMPOSABLEKERNEL=1) +endif() + +add_subdirectory(driver) +add_subdirectory(hiprtc) + +rocm_install_targets( + PRIVATE + TARGETS migraphx_gpu migraphx_device compile_for_gpu + INCLUDE + ${CMAKE_CURRENT_SOURCE_DIR}/include +) diff --git a/docker/rocm/migraphx/targets/gpu/abs.cpp b/docker/rocm/migraphx/targets/gpu/abs.cpp new file mode 100644 index 000000000..8cd0a1d8b --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/abs.cpp @@ -0,0 +1,61 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +#if MIGRAPHX_USE_MIOPEN +shape miopen_abs::compute_shape(const std::vector& inputs) const +{ + check_shapes{inputs, *this}.has(2).packed(); + return inputs.at(0); +} + +argument miopen_abs::compute(context& ctx, + const shape& output_shape, + const std::vector& args) const +{ + float alpha = 1; + float beta = 0; + auto x_desc = make_tensor(args[0].get_shape()); + auto y_desc = make_tensor(output_shape); + miopenActivationForward(ctx.get_stream().get_miopen(), + ad.get(), + &alpha, + x_desc.get(), + args[0].implicit(), + &beta, + y_desc.get(), + args[1].implicit()); + + return args[1]; +} + +void miopen_abs::finalize(context&, const shape&, const std::vector&) { ad = make_abs(); } +#endif +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/allocation_model.cpp b/docker/rocm/migraphx/targets/gpu/allocation_model.cpp new file mode 100644 index 000000000..e5fd2cc27 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/allocation_model.cpp @@ -0,0 +1,48 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +std::string gpu_allocation_model::name() const { return "hip::allocate"; } +operation gpu_allocation_model::allocate(const shape& s) const +{ + return make_op(name(), {{"shape", to_value(s)}}); +} + +operation gpu_allocation_model::preallocate(const shape& s, const std::string& id) const +{ + return make_op("hip::hip_allocate_memory", {{"shape", to_value(s)}, {"id", id}}); +} + +std::string gpu_allocation_model::copy() const { return "hip::copy"; } + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/analyze_streams.cpp b/docker/rocm/migraphx/targets/gpu/analyze_streams.cpp new file mode 100644 index 000000000..e08c89d82 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/analyze_streams.cpp @@ -0,0 +1,82 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct hip_stream_model +{ + std::size_t max_stream = 0; + std::unordered_map ins2stream{}; + std::size_t get_nstream() const { return max_stream + 1; } + std::size_t get_stream(migraphx::instruction_ref ins) const { return ins2stream.at(ins); } + std::size_t get_event_id(migraphx::instruction_ref ins) const + { + auto v = ins->get_operator().to_value(); + return v["event"].to(); + } + bool has_stream(migraphx::instruction_ref ins) const { return ins2stream.count(ins) > 0; } + bool is_record(migraphx::instruction_ref ins) const + { + return ins->name() == "gpu::record_event"; + } + bool is_wait(migraphx::instruction_ref ins) const { return ins->name() == "gpu::wait_event"; } +}; + +stream_model make_stream_model(const module& m) +{ + hip_stream_model hsm; + std::size_t stream = 0; + for(auto ins : iterator_for(m)) + { + if(ins->name() == "gpu::set_stream") + { + auto v = ins->get_operator().to_value(); + stream = v["stream"].to(); + hsm.max_stream = std::max(stream, hsm.max_stream); + } + if(ins->get_operator().is_context_free()) + continue; + if(contains({"hip::hip_allocate_memory", "hip::hip_copy_literal", "@param"}, ins->name())) + continue; + hsm.ins2stream[ins] = stream; + } + return hsm; +} + +std::vector analyze_streams(const module& m) +{ + return migraphx::analyze_streams(m, make_stream_model(m)); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/argmax.cpp b/docker/rocm/migraphx/targets/gpu/argmax.cpp new file mode 100644 index 000000000..b5f720295 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/argmax.cpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
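make_stream_model above walks the module once, updating the current stream whenever it sees gpu::set_stream and tagging each remaining instruction with that stream. A toy, string-based rendering of the same bookkeeping (illustrative only; the real pass also skips context-free instructions, allocations, literal copies, and parameters):

#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Toy version of make_stream_model: instructions are (name, stream-arg) pairs;
// "set_stream" changes the current stream, and every entry is tagged with the
// stream that is current when it is reached.
std::map<std::size_t, std::size_t>
assign_streams(const std::vector<std::pair<std::string, std::size_t>>& instrs)
{
    std::map<std::size_t, std::size_t> ins2stream; // instruction index -> stream
    std::size_t stream = 0;
    for(std::size_t i = 0; i < instrs.size(); ++i)
    {
        if(instrs[i].first == "set_stream")
            stream = instrs[i].second;
        ins2stream[i] = stream;
    }
    return ins2stream;
}

// Example: {"set_stream",0}, {"conv",0}, {"set_stream",1}, {"gemm",0}
// tags "conv" with stream 0 and "gemm" with stream 1.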
+ */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +shape hip_argmax::compute_shape(const std::vector& inputs) const +{ + check_shapes{inputs, *this}.has(2); + return op.normalize_compute_shape({inputs.at(0)}); +} + +argument hip_argmax::compute(context& ctx, const shape&, const std::vector& args) const +{ + auto n_dim = args.front().get_shape().lens().size(); + int64_t tuned_axis = tune_axis(n_dim, op.axis, op.name()); + device::argmax( + ctx.get_stream().get(), args.back(), args.front(), tuned_axis, op.select_last_index); + return args.back(); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/argmin.cpp b/docker/rocm/migraphx/targets/gpu/argmin.cpp new file mode 100644 index 000000000..02c44e29f --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/argmin.cpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +shape hip_argmin::compute_shape(const std::vector& inputs) const +{ + check_shapes{inputs, *this}.has(2); + return op.normalize_compute_shape({inputs.at(0)}); +} + +argument hip_argmin::compute(context& ctx, const shape&, const std::vector& args) const +{ + auto n_dim = args.front().get_shape().lens().size(); + int64_t tuned_axis = tune_axis(n_dim, op.axis, op.name()); + device::argmin( + ctx.get_stream().get(), args.back(), args.front(), tuned_axis, op.select_last_index); + return args.back(); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/code_object_op.cpp b/docker/rocm/migraphx/targets/gpu/code_object_op.cpp new file mode 100644 index 000000000..98a580dc4 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/code_object_op.cpp @@ -0,0 +1,67 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
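hip_argmax and hip_argmin above pass the operator's axis through tune_axis before calling the device kernel. tune_axis itself is not part of this diff; the sketch below assumes the conventional behaviour of mapping a possibly negative axis into [0, n_dim) and rejecting out-of-range values.

#include <cstdint>
#include <stdexcept>
#include <string>

// Assumed behaviour of tune_axis as used by hip_argmax/hip_argmin: normalize a
// negative axis and fail loudly when it is out of range. Sketch only.
std::int64_t normalize_axis(std::int64_t axis, std::int64_t n_dim, const std::string& op_name)
{
    std::int64_t tuned = axis < 0 ? axis + n_dim : axis;
    if(tuned < 0 or tuned >= n_dim)
        throw std::runtime_error(op_name + ": axis out of range");
    return tuned;
}

// Example: normalize_axis(-1, 4, "argmax") == 3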
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_REGISTER_OP(code_object_op); + +shape code_object_op::compute_shape(std::vector inputs) const +{ + std::transform(inputs.begin(), inputs.end(), inputs.begin(), [](const shape& s) { + return s.normalize_standard(); + }); + auto einputs = expected_inputs; + std::transform(einputs.begin(), einputs.end(), einputs.begin(), [](const shape& s) { + return s.normalize_standard(); + }); + if(not migraphx::equal(flatten(einputs), flatten(inputs), &shape::is_compatible)) + MIGRAPHX_THROW("Input shapes have changed: [" + to_string_range(einputs) + "] -> [" + + to_string_range(inputs) + "]"); + return output; +} +argument +code_object_op::compute(context& ctx, const shape&, const std::vector& args) const +{ + auto fargs = flatten(args); + std::vector kargs(fargs.size()); + std::transform( + fargs.begin(), fargs.end(), kargs.begin(), [](const argument& a) { return a.data(); }); + auto [start, stop] = ctx.get_perf_events(); + k.launch(ctx.get_stream().get(), global, local, std::move(kargs), start, stop); + return args[get_output_arg(args.size())]; +} +void code_object_op::finalize(context&, const shape&, const std::vector&) +{ + assert(not code_object.empty()); + k = kernel(code_object, symbol_name); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/compile_gen.cpp b/docker/rocm/migraphx/targets/gpu/compile_gen.cpp new file mode 100644 index 000000000..82ff3c4a2 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/compile_gen.cpp @@ -0,0 +1,576 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
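
// Illustrative sketch (not part of the patch): before launching, code_object_op::compute above
// reduces every flattened argument to an untyped device pointer, since a compiled code object
// only receives raw kernel arguments; shape compatibility was already verified in compute_shape.
// A stand-alone picture of that step with a stand-in argument type (hypothetical names):
#include <vector>

struct toy_argument
{
    void* ptr = nullptr;
    void* data() const { return ptr; }
};

std::vector<void*> to_kernel_args(const std::vector<toy_argument>& args)
{
    std::vector<void*> kargs;
    kargs.reserve(args.size());
    for(const auto& a : args)
        kargs.push_back(a.data()); // kernels see only raw device pointers
    return kargs;
}
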
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace gen { + +static std::vector vector_sizes(const std::vector& inputs) +{ + // If all inputs are half then only use half2 + if(std::all_of(inputs.begin(), inputs.end(), [](const auto& s) { + return s.type() == shape::half_type; + })) + return {2}; + return {4, 2}; +} + +vectorize vectorize::elements(std::size_t axis, + const std::vector& inputs, + const std::vector& sizes) +{ + // disable vectorization for fp8 types + if(std::any_of(inputs.begin(), inputs.end(), [&](auto ishape) { + return contains(fp8_types{}.get(), ishape.type()); + })) + return {1, axis}; + if(std::all_of( + inputs.begin(), inputs.end(), [&](const auto& s) { return s.lens()[axis] == 1; })) + return {1, axis}; + std::vector max_vec_size; + std::transform(inputs.begin(), + inputs.end(), + std::back_inserter(max_vec_size), + [&](const auto& input) -> std::size_t { + auto stride = input.strides()[axis]; + auto len = input.lens()[axis]; + if(not contains({0, 1}, stride)) + return 1; + if(len == 1 and input.elements() > sizes.front()) + return sizes.front(); + auto it = std::find_if(sizes.begin(), sizes.end(), [&](auto vsize) { + // The len is divisible by the size and all the strides are divisible by + // the size + return (len % vsize) == 0 and + std::all_of( + input.strides().begin(), input.strides().end(), [&](auto i) { + return contains({0, 1}, i) or i % vsize == 0; + }); + }); + if(it != sizes.end()) + return *it; + return 1; + }); + return {*std::min_element(max_vec_size.begin(), max_vec_size.end()), axis}; +} + +vectorize vectorize::elements(context& ctx, std::size_t axis, const std::vector& inputs) +{ + // disable vectorization for fp8 types + if(std::any_of(inputs.begin(), inputs.end(), [&](auto ishape) { + return contains(fp8_types{}.get(), ishape.type()); + })) + return {1, axis}; + if(inputs.empty()) + return {1, axis}; + std::size_t n = std::max_element(inputs.begin(), + inputs.end(), + by(std::less<>{}, [](const auto& s) { return s.elements(); })) + ->elements(); + std::size_t max_global = ctx.get_current_device().get_cu_count() * + ctx.get_current_device().get_max_workitems_per_cu(); + std::size_t over = n / max_global; + bool broadcasted = + std::any_of(inputs.begin(), inputs.end(), [](const auto& s) { return s.broadcasted(); }); + std::vector sizes; + if(broadcasted and over > 8) + sizes.push_back(8); + if(over > 4) + sizes.push_back(4); + sizes.push_back(2); + return elements(axis, inputs, sizes); +} + +vectorize vectorize::elements(std::size_t axis, const std::vector& inputs) +{ + return elements(axis, inputs, vector_sizes(inputs)); +} + +std::string vectorize::str() const +{ + return "vectorize<" + to_string(size) + ", " + to_string(axis) + ">()"; +} + +preload preload::broadcasts(std::size_t axis, const std::vector& inputs) +{ + const 
std::size_t max_lds_bytes = 4096; + std::vector result(inputs.size()); + std::vector preloaded; + auto idxs = range(inputs.size()); + std::copy_if(idxs.begin(), idxs.end(), std::back_inserter(preloaded), [&](auto i) { + return inputs[i].strides()[axis] == 0; + }); + std::sort(preloaded.begin(), preloaded.end(), by(std::less<>{}, [&](auto i) { + return inputs[i].bytes(); + })); + + std::size_t bytes = 0; + for(auto i : preloaded) + { + const auto& input = inputs[i]; + bytes += input.bytes(); + if(bytes > max_lds_bytes) + break; + result[i] = true; + } + return {result}; +} + +std::string preload::str() const +{ + std::vector bool_strs; + std::transform(args.begin(), std::prev(args.end()), std::back_inserter(bool_strs), [](bool b) { + if(b) + return "true"; + return "false"; + }); + return "auto_preload(idx)"; +} + +bool preload::is_preloading() const +{ + return std::accumulate(args.begin(), args.end(), false, std::logical_or<>{}); +} + +static std::size_t integer_divide_ceil(std::size_t x, std::size_t y) +{ + return (x + y - std::size_t{1}) / y; +} + +static std::size_t compute_tile_factor(std::size_t r, std::size_t max_size = 64) +{ + std::size_t n = 1; + auto factors = make_array(2, 3, 5, 7, 11); + while(n < max_size) + { + // NOLINTNEXTLINE(readability-qualified-auto) + auto it = std::find_if(factors.begin(), factors.end(), [&](auto d) { return r % d == 0; }); + if(it == factors.end()) + break; + r /= *it; + n *= *it; + } + return n; +} + +tile tile::elements(const std::vector& inputs, std::size_t noutputs) +{ + tile result; + auto ndim = inputs.front().ndim(); + std::vector faxes; + std::transform( + inputs.begin(), inputs.end(), std::back_inserter(faxes), MIGRAPHX_LIFT(find_fast_axis)); + result.axis = std::accumulate(faxes.begin(), faxes.end(), ndim, MIGRAPHX_LIFT(std::min)); + if(result.axis >= (ndim - 1)) + return {}; + auto select = [&](auto m) { + return [&, m](std::size_t faxis, shape input) { + if(input.broadcasted()) + return none; + if(faxis < (ndim - 1)) + return m; + return none; + }; + }; + std::transform(faxes.begin(), + faxes.end() - noutputs, + inputs.begin(), + std::back_inserter(result.args), + select(load)); + std::transform(faxes.end() - noutputs, + faxes.end(), + inputs.end() - noutputs, + std::back_inserter(result.args), + select(store)); + + auto nargs = std::count_if( + result.args.begin(), result.args.end(), [](auto m) { return m != mode::none; }); + // TODO: Handle tiling more than one arguments + if(nargs != 1) + return {}; + + const auto& s = inputs.front(); + auto dim1 = compute_tile_factor(s.lens()[result.axis]); + auto dim2 = compute_tile_factor(s.lens().back(), 4096 / dim1); + if(dim1 == 1 or dim2 == 1) + return {}; + + result.inner = s.lens(); + std::fill(result.inner.begin(), result.inner.end(), 1); + result.inner[result.axis] = dim1; + result.inner.back() = dim2; + + result.outer = s.lens(); + result.outer[result.axis] /= dim1; + result.outer.back() /= dim2; + + auto tile_size = dim1 * dim2; + result.ntiles = s.elements() / tile_size; + // equivalent to dim1 * (dim2 + 1) to avoid bank conflicts + auto tile_bytes = (tile_size + dim1) * s.type_size(); + if(tile_bytes > 65536) + return {}; + + result.block_size = std::min(256, integer_divide_ceil(tile_size / 4, 64) * 64); + return result; +} + +std::string tile::str() const +{ + if(args.empty()) + return "transform_args()"; + std::vector strs; + std::transform(args.begin(), args.end(), std::back_inserter(strs), [](mode m) { + switch(m) + { + case load: return "tile::load"; + case store: return 
"tile::store"; + case none: return "tile::none"; + } + MIGRAPHX_THROW("Invalid mode"); + }); + const std::string auto_tile = "auto_tile<${modes}>(${inner}, ${outer})"; + return interpolate_string(auto_tile, + {{"modes", join_strings(strs, ", ")}, + {"inner", generate_index_ints(inner)}, + {"outer", generate_index_ints(outer)}}); +} + +std::size_t find_fast_axis(const shape& input) +{ + if(input.scalar()) + return input.ndim() - 1; + if(input.broadcasted()) + { + auto stride_it = std::min_element( + input.strides().begin(), input.strides().end(), by(std::less<>{}, [](std::size_t i) { + if(i == 0) + return std::numeric_limits::max(); + return i; + })); + return stride_it - input.strides().begin(); + } + auto permutation = invert_permutation(find_permutation(input)); + auto it = std::max_element(permutation.begin(), permutation.end()); + return it - permutation.begin(); +} + +std::size_t find_fast_axis(const std::vector& inputs) +{ + auto permutation = invert_permutation(find_permutation(inputs)); + auto it = std::max_element(permutation.begin(), permutation.end()); + return it - permutation.begin(); +} + +std::string make_transformer_args(std::vector transformers) +{ + return join_strings(std::move(transformers), ", "); +} + +static void generate_pointwise(cpp_generator& gg, + const module& pm, + const std::string& name, + bool always_return_tuple = false) +{ + module m = pm; + run_passes(m, {rewrite_quantization{}, optimize_module{}}); + m.sort(); + cpp_generator g; + g.always_return_tuple(always_return_tuple); + g.fmap([](const std::string& fname) { return "migraphx::" + fname; }); + g.add_point_op("where", "${function:where}(${0}, ${1}, ${2})"); + g.add_point_op("prelu", "${function:where}(${0} < 0, ${0} * ${1}, ${0})"); + g.add_point_op("sign", "${function:where}(${0} > 0, 1, ${function:where}(${0} < 0, -1, 0))"); + g.add_point_op("equal", "migraphx::abs(${0} == ${1})"); + g.add_point_op("less", "migraphx::abs(${0} < ${1})"); + g.add_point_op("greater", "migraphx::abs(${0} > ${1})"); + g.add_point_op("not", "migraphx::abs(not ${0})"); + // Add explict conversions + g.fresult( + [](const shape& s) { return "migraphx::convert<" + shape::cpp_type(s.type()) + ">"; }); + gg.create_function(g.generate_module(m) + .set_attributes({"__device__", "__attribute__((const))"}) + .set_generic_types(m) + .set_name(name)); +} +std::string generate_pointwise(const module& pm, const std::string& name, bool always_return_tuple) +{ + cpp_generator g; + generate_pointwise(g, pm, name, always_return_tuple); + return g.str(); +} + +std::string reduce_op::str() const +{ + return write + "(r.reduce(" + reduction + ", " + init + ", " + read + ")(" + + join_strings(inputs, ", ") + "))"; +} +void reduce_op::set(const std::string& name, const shape& input, const shape& output) +{ + assert(input.type() != shape::tuple_type); + assert(output.type() != shape::tuple_type); + if(name == "reduce_sum") + { + reduction = "op::sum{}"; + } + else if(name == "reduce_mean") + { + auto reduce_elements = input.elements() / output.elements(); + auto reduce_type = input.type(); + reduction = "op::sum{}"; + std::string mean = "op::mean<" + std::to_string(reduce_elements) + ">{}"; + // Use float accumulator when reduction size is too large for half + if(reduce_type == shape::half_type and reduce_elements > 16384) + read = "compose(" + mean + ", op::convert_to{})"; + else if(contains({shape::float_type, shape::half_type, shape::double_type}, reduce_type)) + read = mean; + else + write = mean; + } + else if(name == "reduce_max") + { + 
reduction = "op::max{}"; + init = "lowest{}"; + } + else if(name == "reduce_min") + { + reduction = "op::min{}"; + init = "highest{}"; + } + else if(name == "reduce_prod") + { + reduction = "op::product{}"; + init = "1"; + } + else if(name == "reduce_any") + { + reduction = "op::logical_or{}"; + init = "bool{false}"; + } + else if(name == "reduce_all") + { + reduction = "op::logical_and{}"; + init = "bool{true}"; + } + else + { + MIGRAPHX_THROW("Unsupported reduce"); + } +} + +void reduce_op::set(instruction_ref ins, const operation& op) +{ + if(op.name() == "gpu::parallel_reduce") + { + auto rop = from_value(op.to_value().at("op")); + auto input = ins->inputs().front()->get_shape(); + auto output = ins->get_shape().sub_shapes().front(); + set(rop.name(), input, output); + read = "compose(array_apply(" + read + "), MIGRAPHX_LIFT(make_array))"; + } + else + { + set(op.name(), ins->inputs().front()->get_shape(), ins->get_shape()); + } +} +std::string reduce_op::generate(instruction_ref ins, const std::vector& x) +{ + reduce_op r{x}; + r.set(ins, ins->get_operator()); + return r.str(); +} + +static bool use_lazy_inner(instruction_ref ins) +{ + if(ins->outputs().size() != 1) + return false; + // When the inputs are broadcasted, it means the lambda will capture SGPRs + // when doing block/wave reduction. This can cause register spilling in + // the compiler when the lambda is evaluated at a later time although it + // shouldn't. Instead, use `inner` to workaround this issue in the + // compiler. + if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](instruction_ref input) { + return input->get_shape().broadcasted(); + })) + return false; + auto output = ins->outputs().front(); + return contains(output->name(), "reduce") or output->name() == "@return"; +} + +void preload_params(module& m) +{ + for(auto ins : iterator_for(m)) + { + if(ins->name() != "@param") + continue; + if(ins->outputs().size() <= 1) + continue; + auto id = m.insert_instruction(std::next(ins), make_op("identity"), ins); + m.replace_instruction(ins, id); + } +} + +std::string generate_reduce(module m, const std::string& name) +{ + preload_params(m); + run_passes(m, {optimize_module{}, prepare_reduce{}, optimize_module{}}); + m.sort(); + cpp_generator g; + g.always_return_tuple(); + auto param_shapes = m.get_parameter_shapes(); + auto max_shape = + std::max_element(param_shapes.begin(), + param_shapes.end(), + by(std::less<>{}, [](const auto& p) { return p.second.elements(); })); + auto ilens = max_shape->second.lens(); + std::size_t i = 0; + auto f = g.generate_module(m, [&](instruction_ref ins, const auto& names) { + if(contains(ins->name(), "reduce")) + { + return reduce_op::generate(ins, cpp_generator::to_args(ins->inputs(), names)); + } + if(ins->name() == "pointwise") + { + auto pointwise_name = "pointwise" + std::to_string(i); + i++; + generate_pointwise(g, *ins->module_inputs().front(), pointwise_name); + std::vector tensors; + std::copy_if(ins->inputs().begin(), + ins->inputs().end(), + std::back_inserter(tensors), + [&](auto input) { + return input->get_shape().lens() == ilens and + not input->get_shape().broadcasted(); + }); + auto inner_names = names; + for(auto input : ins->inputs()) + { + if(input->name() != "@param") + continue; + if(contains(tensors, input)) + continue; + inner_names[input] += "[out_idx]"; + } + for(auto input : tensors) + inner_names[input] += "_lambda_param"; + auto call_function = + pointwise_name + "(" + + join_strings(cpp_generator::to_args(ins->inputs(), inner_names), ", ") + ")"; 
+ if(tensors.empty()) + return call_function; + const std::string inner_template = + "r.${inner}([=](${params}) { return ${call}; })(${args})"; + std::string inner_name = use_lazy_inner(ins) ? "lazy_inner" : "inner"; + auto args = cpp_generator::to_args(tensors, names); + auto params = cpp_generator::to_args(tensors, inner_names); + std::transform( + params.begin(), params.end(), params.begin(), [](auto s) { return "auto " + s; }); + return interpolate_string(inner_template, + {{"inner", inner_name}, + {"params", join_strings(params, ", ")}, + {"args", join_strings(args, ", ")}, + {"call", call_function}}); + } + if(ins->name() == "multibroadcast") + { + return names.at(ins->inputs().front()); + } + if(ins->name() == "get_tuple_elem") + { + const auto& x = names.at(ins->inputs().front()); + auto index = ins->get_operator().to_value()["index"].to(); + return interpolate_string("${x}[${index}]", + {{"x", x}, {"index", std::to_string(index)}}); + } + if(ins->name() == "identity") + { + const auto& x = names.at(ins->inputs().front()); + return "r.inner(op::id{})(" + x + ")"; + } + MIGRAPHX_THROW("Unknown operator: " + ins->name()); + }); + f.set_attributes({"__device__", "__attribute__((const))"}).set_generic_types(m).set_name(name); + f.add_generic_param("r"); + f.add_generic_param("out_idx"); + f.unused_param("out_idx"); + g.create_function(f); + return g.str(); +} + +static std::vector get_op_names(const module& m) +{ + std::vector result; + for(auto& ins : m) + { + if(starts_with(ins.name(), "@")) + continue; + if(contains({"multibroadcast", "contiguous", "identity"}, ins.name())) + continue; + if(ins.name() == "pointwise") + { + auto names = get_op_names(*ins.module_inputs().front()); + result.insert(result.end(), names.begin(), names.end()); + } + else + { + result.push_back(ins.name()); + } + } + return result; +} + +std::string generate_name_from_ops(const module& m, const std::string& postname) +{ + auto op_names = get_op_names(m); + if(not postname.empty()) + op_names.push_back(postname); + if(op_names.empty()) + return "noop"; + return join_strings(op_names, "_"); +} + +} // namespace gen +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/compile_hip.cpp b/docker/rocm/migraphx/targets/gpu/compile_hip.cpp new file mode 100644 index 000000000..58b518725 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/compile_hip.cpp @@ -0,0 +1,406 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
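
// Illustrative sketch (not part of the patch): the per-input rule in vectorize::elements in
// compile_gen.cpp above picks the largest candidate width that divides the axis length while
// every stride is 0, 1, or a multiple of that width (fp8 inputs disable vectorization entirely).
// A simplified standalone version of that check for a single input (hypothetical helper name):
#include <cstddef>
#include <vector>

std::size_t pick_vector_width(const std::vector<std::size_t>& lens,
                              const std::vector<std::size_t>& strides,
                              std::size_t axis,
                              const std::vector<std::size_t>& candidates)
{
    for(std::size_t v : candidates) // e.g. {4, 2}, or just {2} when all inputs are half
    {
        if(lens[axis] % v != 0)
            continue;
        bool strides_ok = true;
        for(std::size_t s : strides)
            if(s != 0 and s != 1 and s % v != 0)
                strides_ok = false;
        if(strides_ok)
            return v;
    }
    return 1; // fall back to scalar accesses
}
// pick_vector_width({8, 64}, {64, 1}, 1, {4, 2}) -> 4; with strides {64, 3} it would return 1
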
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef MIGRAPHX_USE_HIPRTC +#include +#include +#include +#include +#include +#include +#include +#include +#include +#else +#include +#include +#endif + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG_SYM); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_ASM); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC); + +#ifdef MIGRAPHX_USE_HIPRTC + +std::string hiprtc_error(hiprtcResult err, const std::string& msg) +{ + return "hiprtc: " + (hiprtcGetErrorString(err) + (": " + msg)); +} + +void hiprtc_check_error(hiprtcResult err, const std::string& msg, const std::string& ctx) +{ + if(err != HIPRTC_SUCCESS) + throw make_exception(ctx, hiprtc_error(err, msg)); +} + +// NOLINTNEXTLINE +#define MIGRAPHX_HIPRTC(...) \ + hiprtc_check_error(__VA_ARGS__, #__VA_ARGS__, MIGRAPHX_MAKE_SOURCE_CTX()) + +#define MIGRAPHX_HIPRTC_THROW(error, msg) MIGRAPHX_THROW(hiprtc_error(error, msg)) + +// Workaround hiprtc's broken API +void hiprtc_program_destroy(hiprtcProgram prog) { hiprtcDestroyProgram(&prog); } +using hiprtc_program_ptr = MIGRAPHX_MANAGE_PTR(hiprtcProgram, hiprtc_program_destroy); + +template +hiprtc_program_ptr hiprtc_program_create(Ts... xs) +{ + hiprtcProgram prog = nullptr; + auto result = hiprtcCreateProgram(&prog, xs...); + hiprtc_program_ptr p{prog}; + if(result != HIPRTC_SUCCESS) + MIGRAPHX_HIPRTC_THROW(result, "Create program failed."); + return p; +} + +struct hiprtc_program +{ + struct string_array + { + std::deque strings{}; + std::vector c_strs{}; + + string_array() {} + string_array(const string_array&) = delete; + + std::size_t size() const { return strings.size(); } + + const char** data() { return c_strs.data(); } + + void push_back(std::string s) + { + strings.push_back(std::move(s)); + c_strs.push_back(strings.back().c_str()); + } + }; + + hiprtc_program_ptr prog = nullptr; + string_array headers{}; + string_array include_names{}; + std::string cpp_src = ""; + std::string cpp_name = ""; + + hiprtc_program(const std::string& src, const std::string& name = "main.cpp") + : cpp_src(src), cpp_name(name) + { + create_program(); + } + + hiprtc_program(std::vector srcs) + { + for(auto&& src : srcs) + { + if(ends_with(src.path, ".cpp")) + { + cpp_src = std::move(src.content); + cpp_name = std::move(src.path); + } + else + { + headers.push_back(std::move(src.content)); + include_names.push_back(std::move(src.path)); + } + } + create_program(); + } + + void create_program() + { + assert(not cpp_src.empty()); + assert(not cpp_name.empty()); + assert(headers.size() == include_names.size()); + prog = hiprtc_program_create(cpp_src.c_str(), + cpp_name.c_str(), + headers.size(), + headers.data(), + include_names.data()); + } + + void compile(const std::vector& options, bool quiet = false) const + { + if(enabled(MIGRAPHX_TRACE_HIPRTC{})) + std::cout << "hiprtc " << join_strings(options, " ") << " " << cpp_name << std::endl; + std::vector c_options; + std::transform(options.begin(), + options.end(), + std::back_inserter(c_options), + [](const std::string& s) { return 
s.c_str(); }); + auto result = hiprtcCompileProgram(prog.get(), c_options.size(), c_options.data()); + auto prog_log = log(); + if(not prog_log.empty() and not quiet) + { + std::cerr << prog_log << std::endl; + } + if(result != HIPRTC_SUCCESS) + MIGRAPHX_HIPRTC_THROW(result, "Compilation failed."); + } + + std::string log() const + { + std::size_t n = 0; + MIGRAPHX_HIPRTC(hiprtcGetProgramLogSize(prog.get(), &n)); + if(n == 0) + return {}; + std::string buffer(n, '\0'); + MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data())); + assert(buffer.back() != 0); + return buffer; + } + + std::vector get_code_obj() const + { + std::size_t n = 0; + MIGRAPHX_HIPRTC(hiprtcGetCodeSize(prog.get(), &n)); + std::vector buffer(n); + MIGRAPHX_HIPRTC(hiprtcGetCode(prog.get(), buffer.data())); + return buffer; + } +}; + +std::vector> compile_hip_src_with_hiprtc(std::vector srcs, + const std::vector& params, + const std::string& arch) +{ + hiprtc_program prog(std::move(srcs)); + auto options = params; + options.push_back("-DMIGRAPHX_USE_HIPRTC=1"); + if(enabled(MIGRAPHX_GPU_DEBUG{})) + options.push_back("-DMIGRAPHX_DEBUG"); + if(std::none_of(options.begin(), options.end(), [](const std::string& s) { + return starts_with(s, "--std=") or starts_with(s, "-std="); + })) + options.push_back("-std=c++17"); + options.push_back("-fno-gpu-rdc"); + options.push_back("-O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3")); + options.push_back("-Wno-cuda-compat"); + options.push_back("--offload-arch=" + arch); + prog.compile(options); + return {prog.get_code_obj()}; +} + +bool hip_has_flags(const std::vector& flags) +{ + hiprtc_program prog{" "}; + + std::string src = " "; + src_file input{"main.cpp", src}; + std::vector srcs = {input}; + + try + { + std::string arch = "gfx900"; + compile_hip_src(srcs, flags, arch); + return true; + } + catch(...) 
+ { + return false; + } +} + +std::vector> compile_hip_src(const std::vector& srcs, + const std::vector& params, + const std::string& arch) +{ + std::vector hsrcs{srcs.begin(), srcs.end()}; + if(enabled(MIGRAPHX_GPU_DUMP_SRC{})) + { + for(const auto& src : srcs) + { + if(src.path.extension() != ".cpp") + continue; + std::cout << std::string(src.content) << std::endl; + } + } + + auto fname = make_executable_filename("migraphx-hiprtc-driver"); + auto p = dynamic_loader::path(&compile_hip_src_with_hiprtc); + auto driver = p.parent_path() / fname; + + bool found = fs::exists(driver); + if(not found) + { + driver = p.parent_path().parent_path() / "bin" / fname; + found = fs::exists(driver); + } + + if(found) + { + value v; + v["srcs"] = to_value(hsrcs); + v["params"] = to_value(params); + v["arch"] = to_value(arch); + + tmp_dir td{}; + auto out = td.path / "output"; + + process(driver, {quote_string(out.string())}).write([&](auto writer) { + to_msgpack(v, writer); + }); + if(fs::exists(out)) + return {read_buffer(out)}; + } + return compile_hip_src_with_hiprtc(std::move(hsrcs), params, arch); +} + +#else // MIGRAPHX_USE_HIPRTC + +std::vector> +compile_hip_src_with_hiprtc(std::vector, // NOLINT + const std::vector&, // NOLINT + const std::string&) +{ + MIGRAPHX_THROW("Not using hiprtc"); +} + +bool is_hip_clang_compiler() +{ + static const auto result = fs::path{MIGRAPHX_HIP_COMPILER}.stem() == "clang++"; + return result; +} + +#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER + +bool has_compiler_launcher() +{ + static const auto result = fs::exists(MIGRAPHX_HIP_COMPILER_LAUNCHER); + return result; +} + +#endif + +src_compiler assemble(src_compiler compiler) +{ + compiler.out_ext = ".S"; + std::replace(compiler.flags.begin(), compiler.flags.end(), "-c", "-S"); + return compiler; +} + +std::vector> compile_hip_src(const std::vector& srcs, + const std::vector& params, + const std::string& arch) +{ + assert(not srcs.empty()); + + if(not is_hip_clang_compiler()) + MIGRAPHX_THROW("Unknown hip compiler: " MIGRAPHX_HIP_COMPILER); + + src_compiler compiler; + compiler.flags = params; + compiler.compiler = MIGRAPHX_HIP_COMPILER; +#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER + if(has_compiler_launcher()) + compiler.launcher = MIGRAPHX_HIP_COMPILER_LAUNCHER; +#endif + + if(std::none_of(params.begin(), params.end(), [](const std::string& s) { + return starts_with(s, "--std=") or starts_with(s, "-std="); + })) + compiler.flags.emplace_back("--std=c++17"); + compiler.flags.emplace_back(" -fno-gpu-rdc"); + if(enabled(MIGRAPHX_GPU_DEBUG_SYM{})) + compiler.flags.emplace_back("-g"); + compiler.flags.emplace_back("-c"); + compiler.flags.emplace_back("--offload-arch=" + arch); + compiler.flags.emplace_back("--cuda-device-only"); + compiler.flags.emplace_back("-O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3") + " "); + + if(enabled(MIGRAPHX_GPU_DEBUG{})) + compiler.flags.emplace_back("-DMIGRAPHX_DEBUG"); + + compiler.flags.emplace_back("-Wno-unused-command-line-argument"); + compiler.flags.emplace_back("-Wno-cuda-compat"); + compiler.flags.emplace_back(MIGRAPHX_HIP_COMPILER_FLAGS); + + if(enabled(MIGRAPHX_GPU_DUMP_SRC{})) + { + for(const auto& src : srcs) + { + if(src.path.extension() != ".cpp") + continue; + std::cout << std::string(src.content) << std::endl; + } + } + + if(enabled(MIGRAPHX_GPU_DUMP_ASM{})) + { + + std::cout << assemble(compiler).compile(srcs).data() << std::endl; + } + + return {compiler.compile(srcs)}; +} + +bool hip_has_flags(const std::vector& flags) +{ + src_compiler compiler; + compiler.compiler = 
MIGRAPHX_HIP_COMPILER; + compiler.flags = flags; + compiler.flags.emplace_back("-x hip"); + compiler.flags.emplace_back("-c"); + compiler.flags.emplace_back("--offload-arch=gfx900"); + compiler.flags.emplace_back("--cuda-device-only"); + + std::string src; + src_file input{"main.cpp", src}; + + try + { + compiler.compile({input}); + return true; + } + catch(...) + { + return false; + } +} + +#endif // MIGRAPHX_USE_HIPRTC + +std::string enum_params(std::size_t count, std::string param) +{ + std::vector items(count); + transform(range(count), items.begin(), [&](auto i) { return param + std::to_string(i); }); + return join_strings(items, ","); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/compile_hip_code_object.cpp b/docker/rocm/migraphx/targets/gpu/compile_hip_code_object.cpp new file mode 100644 index 000000000..dfd18ad7d --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/compile_hip_code_object.cpp @@ -0,0 +1,215 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
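
// Illustrative sketch (not part of the patch): enum_params above expands a count and a prefix
// into the comma-separated parameter list that gets spliced into generated kernel source. A
// standalone equivalent (hypothetical name):
#include <cstddef>
#include <string>

std::string enum_params_sketch(std::size_t count, const std::string& param)
{
    std::string result;
    for(std::size_t i = 0; i < count; i++)
    {
        if(i > 0)
            result += ",";
        result += param + std::to_string(i);
    }
    return result; // enum_params_sketch(3, "void* p") -> "void* p0,void* p1,void* p2"
}
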
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +std::string generate_make_shape(const shape& s) +{ + return "make_shape(" + generate_index_ints(s.lens()) + ", " + generate_index_ints(s.strides()) + + ")"; +} + +static const char* const make_tensor_template = R"__migraphx__( +template<> +struct make_tensor<${n}> +{ + static __device__ auto apply(void* __restrict__ p) + { + return make_tensor_view(reinterpret_cast<${type}* __restrict__>(p), make_shape(${lens}, ${strides})); + } +}; +)__migraphx__"; + +std::string generate_make_tensor(std::size_t n, const shape& s) +{ + return interpolate_string(make_tensor_template, + {{"n", std::to_string(n)}, + {"type", shape::cpp_type(s.type())}, + {"lens", generate_index_ints(s.lens())}, + {"strides", generate_index_ints(s.strides())}}); +} + +std::string generate_args_hpp(const std::vector& inputs) +{ + std::string inner; + for(std::size_t i = 0; i < inputs.size(); i++) + { + inner += generate_make_tensor(i, inputs[i]); + } + const std::string args_hpp = R"__migraphx__( +#ifndef MIGRAPHX_GUARD_AUTO_ARGS_HPP +#define MIGRAPHX_GUARD_AUTO_ARGS_HPP + +#include +#include +#include + +namespace migraphx { + +__content__ + +} // namespace migraphx +#endif +)__migraphx__"; + return replace_string(args_hpp, "__content__", inner); +} + +static std::vector get_compiler_warnings() +{ + std::vector warnings = { + "-Weverything", + "-Wno-c++98-compat", + "-Wno-c++98-compat-pedantic", + "-Wno-conversion", + "-Wno-double-promotion", + "-Wno-exit-time-destructors", + "-Wno-extra-semi", + "-Wno-extra-semi-stmt", + "-Wno-float-conversion", + "-Wno-gnu-anonymous-struct", + "-Wno-gnu-zero-variadic-macro-arguments", + "-Wno-missing-prototypes", + "-Wno-nested-anon-types", + "-Wno-padded", + "-Wno-shorten-64-to-32", + "-Wno-sign-conversion", + "-Wno-sign-compare", + "-Wno-unused-command-line-argument", + "-Wno-weak-vtables", + "-Wno-c99-extensions", + }; + + if(hip_has_flags({"-Werror", "-Wunsafe-buffer-usage"})) + warnings.push_back("-Wno-unsafe-buffer-usage"); + return warnings; +} + +const std::vector& compiler_warnings() +{ + static std::vector warnings = get_compiler_warnings(); + return warnings; +} + +void hip_compile_options::set_launch_params( + const value& v, + const std::function& compute_global, + std::size_t default_local) +{ + local = v.get("local", default_local); + if(v.contains("global")) + global = v.at("global").to(); + else + global = compute_global(local); +} + +static bool hip_accept_non_uniform_wg() +{ + static bool non_uniform_wg = hip_has_flags({"-fno-offload-uniform-block"}); + return non_uniform_wg; +} + +std::function +compute_global_for(context& ctx, std::size_t n, std::size_t over) +{ + assert(over > 0); + std::size_t max_global = ctx.get_current_device().get_cu_count() * + ctx.get_current_device().get_max_workitems_per_cu(); + return [n, over, max_global](std::size_t local) { + std::size_t num_elements = n; + if(not hip_accept_non_uniform_wg()) + { + num_elements = (1 + (n - 1) / local) * local; + } + std::size_t groups = 1 + (num_elements - 1) / local; + std::size_t max_blocks = max_global / local; + std::size_t nglobal = std::min(max_blocks * over, groups) * local; + return std::min(nglobal, num_elements); + }; +} + +std::size_t compute_block_size(context& ctx, std::size_t n, std::size_t max_block_size) +{ + const std::size_t min_block_size = ctx.get_current_device().get_wavefront_size(); + auto block_size = (((n - 1) / 
min_block_size + 1)) * min_block_size; + return std::min(std::max(min_block_size, block_size), max_block_size); +} + +operation +compile_hip_code_object(context& ctx, const std::string& content, hip_compile_options options) +{ + assert(options.global > 0); + assert(options.local > 0); + assert(not options.inputs.empty()); + assert(options.inputs.size() == options.virtual_inputs.size() or + options.virtual_inputs.empty()); + std::vector srcs = options.additional_src_files; + static auto kernels{::migraphx_kernels()}; + std::transform( + kernels.begin(), + kernels.end(), + std::back_inserter(srcs), + [](const std::pair& elem) { return src_file{elem}; }); + srcs.emplace_back("main.cpp", content); + auto args_hpp = + generate_args_hpp(options.virtual_inputs.empty() ? options.inputs : options.virtual_inputs); + srcs.emplace_back("args.hpp", args_hpp); + + if(options.global % options.local != 0 and hip_accept_non_uniform_wg()) + options.emplace_param("-fno-offload-uniform-block"); + else + assert(options.global % options.local == 0); + + options.emplace_param("-DMIGRAPHX_NGLOBAL=" + std::to_string(options.global)); + options.emplace_param("-DMIGRAPHX_NLOCAL=" + std::to_string(options.local)); + options.emplace_param("-DMIGRAPHX_WAVEFRONTSIZE=" + + std::to_string(ctx.get_current_device().get_wavefront_size())); + const auto& warnings = compiler_warnings(); + options.params.insert(options.params.end(), warnings.begin(), warnings.end()); + options.emplace_param("-ftemplate-backtrace-limit=0"); + options.emplace_param("-Werror"); + auto cos = compile_hip_src(srcs, options.params, get_device_name()); + if(cos.size() != 1) + MIGRAPHX_THROW("No code object"); + return code_object_op{value::binary{cos.front()}, + options.kernel_name, + options.global, + options.local, + options.inputs, + options.output, + options.output_arg}; +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/compile_hipblaslt.cpp b/docker/rocm/migraphx/targets/gpu/compile_hipblaslt.cpp new file mode 100644 index 000000000..c320e6b7d --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/compile_hipblaslt.cpp @@ -0,0 +1,78 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
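
// Illustrative sketch (not part of the patch): compute_block_size above rounds the element count
// up to a multiple of the wavefront size and clamps the result between the wavefront size and a
// maximum block size. A standalone version with made-up numbers:
#include <algorithm>
#include <cstddef>

std::size_t block_size_for(std::size_t n, std::size_t wavefront, std::size_t max_block)
{
    std::size_t rounded = ((n - 1) / wavefront + 1) * wavefront; // round n up to a wavefront multiple
    return std::min(std::max(wavefront, rounded), max_block);
}
// block_size_for(1000, 64, 256) == 256 (1000 rounds up to 1024, then clamps to the maximum)
// block_size_for(100, 64, 1024) == 128 (100 rounds up to two wavefronts)
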
+ */ + +#if MIGRAPHX_USE_HIPBLASLT +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +static size_t compile(migraphx::context& ctx, operation& op, instruction_ref ins) +{ + auto v = op.compile(ctx, ins->get_shape(), to_shapes(ins->inputs())); + return v.get("workspace", 0); +} + +void compile_hipblaslt::apply(module& m) const +{ + assert(ctx); + for(auto ins : iterator_for(m)) + { + if(ins->name() != "gpu::hipblaslt_op") + continue; + auto op = any_cast(ins->get_operator()).op; + auto inputs = ins->inputs(); + + std::size_t ws = hipblaslt_workspace_size; + + auto alloc = m.insert_instruction( + ins, make_op("allocate", {{"shape", to_value(shape{shape::uint8_type, {ws}})}})); + inputs.insert(std::prev(inputs.end()), alloc); + m.replace_instruction(ins, op, inputs); + + // Calculate workspace size + ws = compile(*ctx, op, ins); + auto alloc_after = m.insert_instruction( + ins, make_op("allocate", {{"shape", to_value(shape{shape::uint8_type, {ws}})}})); + + // Replace the workspace size with actual worksapce size needed. + auto it = std::find(inputs.begin(), inputs.end(), alloc); + if(it != inputs.end()) + { + *it = alloc_after; // Replace `alloc` with `alloc_after` + } + m.replace_instruction(ins, op, inputs); + } +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_USE_HIPBLASLT diff --git a/docker/rocm/migraphx/targets/gpu/compile_miopen.cpp b/docker/rocm/migraphx/targets/gpu/compile_miopen.cpp new file mode 100644 index 000000000..583601bdd --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/compile_miopen.cpp @@ -0,0 +1,89 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
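
// Illustrative sketch (not part of the patch): compile_hipblaslt above first splices a
// default-sized workspace buffer in as the second-to-last input (the last input is the output
// allocation), compiles the op to learn the real workspace requirement, then swaps in an
// allocation of that size. The container surgery itself is just an insert before the last
// element:
#include <iterator>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> inputs = {"a", "b", "output_alloc"};
    inputs.insert(std::prev(inputs.end()), "workspace_alloc");
    // inputs is now {"a", "b", "workspace_alloc", "output_alloc"}
}
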
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct miopen_op +{ + operation op = op::identity{}; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.op, "op")); + } + + std::string name() const { return "gpu::miopen_op"; } + + shape compute_shape(std::vector inputs) const + { + inputs.push_back(inputs.back()); + return op.compute_shape(inputs); + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; +MIGRAPHX_REGISTER_OP(miopen_op); + +std::size_t compile_miopen::compile(operation& op, instruction_ref ins) const +{ + auto v = op.compile(*ctx, ins->get_shape(), to_shapes(ins->inputs())); + return v.get("workspace", 0); +} + +void compile_miopen::apply(module& m) const +{ + assert(ctx); + for(auto ins : iterator_for(m)) + { + if(ins->name() != "gpu::miopen_op") + continue; + auto op = any_cast(ins->get_operator()).op; + std::size_t ws = 0; + ws = compile(op, ins); + auto inputs = ins->inputs(); + auto alloc = m.insert_instruction( + ins, make_op("allocate", {{"shape", to_value(shape{shape::int8_type, {ws}})}})); + inputs.insert(std::prev(inputs.end()), alloc); + + m.replace_instruction(ins, op, inputs); + } +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/compile_ops.cpp b/docker/rocm/migraphx/targets/gpu/compile_ops.cpp new file mode 100644 index 000000000..cc5a7fc24 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/compile_ops.cpp @@ -0,0 +1,332 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_COMPILE_PARALLEL); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_BENCHMARKING); + +struct precompile_op +{ + operation op = op::identity{}; + std::size_t additional_args = 1; + bool ignore_modules = false; + std::optional output_shape = nullopt; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.op, "op"), + f(self.additional_args, "additional_args"), + f(self.ignore_modules, "ignore_modules"), + f(self.output_shape, "output_shape")); + } + + std::string name() const { return "gpu::precompile_op"; } + + shape compute_shape(std::vector inputs, const std::vector& mods) const + { + // Pop off additional args + inputs.resize(inputs.size() - additional_args); + if(output_shape.has_value()) + return output_shape.value(); + if(ignore_modules) + return op.compute_shape(inputs); + return op.compute_shape(inputs, mods); + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +MIGRAPHX_REGISTER_OP(precompile_op); + +struct compiled_result +{ + compiler_replace replace; + instruction_ref ins; + + friend std::ostream& operator<<(std::ostream& os, const compiled_result& cr) + { + cr.replace.trace(os, cr.ins); + return os; + } +}; + +struct compile_plan +{ + context* ctx; + operation preop; + instruction_ref ins; + optional config = nullopt; + std::vector> results = {}; + void update_config(bool exhaustive) + { + config = get_tuning_config(*ctx, ins, preop, exhaustive); + } + template + void insert_compiles(Vector& compiles, const value& solution, std::size_t i) + { + compiles.emplace_back([=] { + try + { + results[i] = compiled_result{compile(*ctx, ins, preop, solution), ins}; + } + catch(const std::exception& e) + { + const auto trace_level = value_of(MIGRAPHX_TRACE_BENCHMARKING{}); + if(trace_level > 0) + std::cerr << "Exception in " + preop.name() + ": " + e.what() << std::endl; + results[i] = nullopt; + } + catch(...) 
+ { + results[i] = nullopt; + } + }); + } + + template + void add_compiles(Vector& compiles) + { + if(config.has_value()) + { + const auto& problem = config->problem; + if(auto sol = ctx->get_problem_cache().get(preop.name(), problem)) + { + auto solution = sol.value(); + // No solution yet until benchmarked so skip for now + if(solution.is_null()) + return; + results.resize(1); + insert_compiles(compiles, solution, 0); + } + else + { + ctx->get_problem_cache().mark(preop.name(), problem); + const auto& solutions = config->solutions; + if(solutions.empty()) + MIGRAPHX_THROW("No solutions provided for " + preop.name() + " with " + + to_string(problem)); + results.resize(solutions.size()); + for(auto i : range(solutions.size())) + { + auto solution = solutions[i]; + insert_compiles(compiles, solution, i); + } + } + } + else + { + results.resize(1); + insert_compiles(compiles, value{}, 0); + } + } + std::string problem_string() const + { + if(config) + return to_string(config->problem); + return ""; + } + + const compiled_result& benchmark() const + { + const auto trace_level = value_of(MIGRAPHX_TRACE_BENCHMARKING{}); + if(trace_level > 0 and not results.empty()) + { + std::cout << "Benchmarking " << preop.name() << ": " << results.size() << " configs" + << std::endl; + } + if(results.empty()) + MIGRAPHX_THROW("No valid tuned compilation for " + preop.name() + " with " + + problem_string()); + if(results.size() == 1) + { + if(not results.front().has_value()) + MIGRAPHX_THROW("No valid tuned compilation for " + preop.name() + " with " + + problem_string()); + return *results.front(); + } + if(not config) + MIGRAPHX_THROW("Multiple kernels without config for " + preop.name()); + if(trace_level > 1) + std::cout << "Problem: " << config->problem << std::endl; + std::vector times; + times.reserve(results.size()); + std::transform(results.begin(), + results.end(), + config->solutions.begin(), + std::back_inserter(times), + [&](const auto& cr, const auto& solution) { + if(trace_level > 1) + std::cout << "Benchmarking solution: " << solution << std::endl; + if(not cr.has_value()) + { + if(trace_level > 1) + std::cout << "No binary" << std::endl; + return std::numeric_limits::max(); + } + if(trace_level > 2) + std::cout << *cr << std::endl; + /* + create a small program with insturction being compiled and call "replace" + on that which would insert all the compiled code objects, prefills etc. 
+ necessary to run candidate code object + */ + program bench_prog; + auto* bench_mm = bench_prog.get_main_module(); + std::vector bench_ins_inputs; + + std::transform(cr->ins->inputs().begin(), + cr->ins->inputs().end(), + std::back_inserter(bench_ins_inputs), + [&](const auto& arg) { + return bench_mm->add_parameter( + std::to_string(bench_ins_inputs.size()), + arg->get_shape()); + }); + auto bench_ins = bench_mm->add_instruction( + cr->ins->get_operator(), bench_ins_inputs, cr->ins->module_inputs()); + cr->replace.replace(*bench_mm, bench_ins); + // do dead code elimination by directly removing instruction + bench_mm->remove_instruction(bench_ins); + auto t = time_program(*ctx, bench_prog, 20); + if(trace_level > 1) + std::cout << t << "ms" << std::endl; + return t; + }); + std::this_thread::sleep_for(std::chrono::milliseconds{50}); + auto i = std::distance(times.begin(), std::min_element(times.begin(), times.end())); + if(trace_level > 0) + std::cout << "Fastest solution: " << config->solutions.at(i) << std::endl; + ctx->get_problem_cache().insert(preop.name(), config->problem, config->solutions.at(i)); + if(not results[i].has_value()) + MIGRAPHX_THROW("No valid tuned compilation for " + preop.name() + " with " + + problem_string()); + auto skipped = std::count_if( + results.begin(), results.end(), [](const auto& cr) { return not cr.has_value(); }); + if(skipped > 0) + std::cout << "Skipped " << skipped << " configs for " << preop.name() << std::endl; + + return *results[i]; + } + + void replace(module& m) const + { + const auto& cr = benchmark(); + cr.replace.replace(m, cr.ins); + } +}; + +template +void par_compile(std::size_t n, F f) +{ + if(n == 0) + return; + auto d = value_of(MIGRAPHX_GPU_COMPILE_PARALLEL{}); + if(d == 0) + d = n; + par_for(n, n / d, f); +} + +struct compile_manager +{ + std::vector cps; + bool exhaustive = false; + + template + void add_plan(Ts&&... xs) + { + cps.push_back({std::forward(xs)...}); + } + + void update_configs() + { + par_compile(cps.size(), [&](auto i) { cps[i].update_config(exhaustive); }); + } + + void compile(module& m) + { + std::vector> compiles; + for(auto& cp : cps) + { + cp.add_compiles(compiles); + } + par_compile(compiles.size(), [&](auto i) { compiles[i](); }); + + // Replace and/or benchmark + for(const auto& cp : cps) + { + if(cp.results.empty()) + continue; + cp.replace(m); + } + + // Remove compile_plan already executed + cps.erase(std::remove_if(cps.begin(), + cps.end(), + [](const auto& cp) { return not cp.results.empty(); }), + cps.end()); + } +}; + +void compile_ops::apply(module& m) const +{ + compile_manager cm; + cm.exhaustive = exhaustive_tune; + // Find all precompile ops + for(auto ins : iterator_for(m)) + { + if(ins->name() != "gpu::precompile_op") + continue; + operation preop = any_cast(ins->get_operator()).op; + cm.add_plan(ctx, preop, ins); + } + cm.update_configs(); + cm.compile(m); + // Compile already tuned configs + cm.compile(m); + assert(cm.cps.empty()); +} + +} // namespace gpu + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/compile_pointwise.cpp b/docker/rocm/migraphx/targets/gpu/compile_pointwise.cpp new file mode 100644 index 000000000..ee682cf2c --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/compile_pointwise.cpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
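
// Illustrative sketch (not part of the patch): compile_plan::benchmark above times one small
// program per candidate solution and keeps the index of the minimum, which is then stored in the
// problem cache. The selection step reduces to:
#include <algorithm>
#include <iterator>
#include <vector>

int main()
{
    std::vector<double> times = {1.8, 0.9, 2.4}; // milliseconds per candidate (made-up numbers)
    auto best = std::distance(times.begin(), std::min_element(times.begin(), times.end()));
    (void)best; // best == 1, so solutions.at(1) would be cached as the tuned config
}
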
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +operation +compile_pointwise(context& ctx, const std::vector& in_shapes, const_module_ref pm) +{ + auto pf = gen::generate_pointwise(*pm, "inner_pointwise", true); + std::string lambda = "MIGRAPHX_LIFT(inner_pointwise)"; + auto kernel_name = gen::generate_name_from_ops(*pm, "kernel"); + return gpu::compile_op("pointwise", + ctx, + in_shapes, + {{"lambda", lambda}, {"preamble", pf}, {"kernel", kernel_name}}); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/compiler.cpp b/docker/rocm/migraphx/targets/gpu/compiler.cpp new file mode 100644 index 000000000..3b3b786e2 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/compiler.cpp @@ -0,0 +1,74 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +namespace { +struct compiler_handle +{ + compiler_compile compile; + compiler_compile_op compile_op; + compiler_tuning_config get_tuning_config; +}; +} // namespace + +auto& compiler_map() +{ + static std::unordered_map m; // NOLINT + return m; +} + +void register_compiler(const std::string& name, + compiler_compile c, + compiler_compile_op cop, + compiler_tuning_config ctg) +{ + compiler_map()[name] = {std::move(c), std::move(cop), std::move(ctg)}; +} + +bool has_compiler_for(const std::string& name) { return compiler_map().count(name) > 0; } +compiler_replace +compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) +{ + return compiler_map().at(op.name()).compile(ctx, ins, op, solution); +} +operation +compile_op(const std::string& name, context& ctx, const std::vector& inputs, const value& v) +{ + return compiler_map().at(name).compile_op(ctx, inputs, v); +} + +optional +get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive) +{ + return compiler_map().at(op.name()).get_tuning_config(ctx, ins, op, exhaustive); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device/argmax.cpp b/docker/rocm/migraphx/targets/gpu/device/argmax.cpp new file mode 100644 index 000000000..e71a1b955 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/argmax.cpp @@ -0,0 +1,52 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
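
// Illustrative sketch (not part of the patch): compiler.cpp above is a plain name-to-callbacks
// registry kept in a function-local static map. A stripped-down standalone version of the same
// pattern (all names hypothetical):
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

using compile_fn = std::function<std::string(int)>;

std::unordered_map<std::string, compile_fn>& registry()
{
    static std::unordered_map<std::string, compile_fn> m; // constructed on first use
    return m;
}

void register_backend(const std::string& name, compile_fn f) { registry()[name] = std::move(f); }

bool has_backend(const std::string& name) { return registry().count(name) > 0; }

int main()
{
    register_backend("pointwise", [](int n) { return "kernel_" + std::to_string(n); });
    if(has_backend("pointwise"))
        std::cout << registry().at("pointwise")(3) << "\n"; // prints "kernel_3"
}
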
+ */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void argmax(hipStream_t stream, + const argument& result, + const argument& arg, + int64_t axis, + bool select_last_index) +{ + if(select_last_index) + arg_op(argmax_op_last_index{}, stream, result, arg, axis); + else + arg_op(argmax_op_first_index{}, stream, result, arg, axis); +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device/argmin.cpp b/docker/rocm/migraphx/targets/gpu/device/argmin.cpp new file mode 100644 index 000000000..18338bc48 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/argmin.cpp @@ -0,0 +1,52 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void argmin(hipStream_t stream, + const argument& result, + const argument& arg, + int64_t axis, + bool select_last_index) +{ + if(select_last_index) + arg_op(argmin_op_last_index{}, stream, result, arg, axis); + else + arg_op(argmin_op_first_index{}, stream, result, arg, axis); +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device/contiguous.cpp b/docker/rocm/migraphx/targets/gpu/device/contiguous.cpp new file mode 100644 index 000000000..7d30aec54 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/contiguous.cpp @@ -0,0 +1,65 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void contiguous_nonstandard(hipStream_t stream, const argument& result, const argument& arg) +{ + shape s{result.get_shape().type(), result.get_shape().lens()}; + visit_all(result, arg)([&](auto output_v, auto input_v) { + hip_visit_views(output_v, input_v, s)([&](auto output, auto input, auto standard_shape) { + mi_gs_launch(stream, + standard_shape)([=](auto idx) __device__ { output[idx] = input[idx]; }); + }); + }); +} + +void contiguous_packed(hipStream_t stream, const argument& result, const argument& arg) +{ + index_int nelements = result.get_shape().elements(); + visit_all(result, arg)([&](auto output_v, auto input_v) { + const auto* input = device_cast(input_v.data()); + auto* output = device_cast(output_v.data()); + gs_launch(stream, nelements)([=](auto i) __device__ { output[i] = input[i]; }); + }); +} + +void contiguous(hipStream_t stream, const argument& result, const argument& arg) +{ + if(result.get_shape() == arg.get_shape() and result.get_shape().packed()) + contiguous_packed(stream, result, arg); + else + contiguous_nonstandard(stream, result, arg); +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device/fill.cpp b/docker/rocm/migraphx/targets/gpu/device/fill.cpp new file mode 100644 index 000000000..ea6640b7e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/fill.cpp @@ -0,0 +1,40 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void fill(hipStream_t stream, const argument& result, unsigned long val) +{ + nary(stream, result)([=]() __device__ { return val; }); +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/array.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/array.hpp new file mode 100644 index 000000000..41d58e667 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/array.hpp @@ -0,0 +1,185 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \ + MIGRAPHX_DEVICE_CONSTEXPR hip_array& operator op(const hip_array& x) \ + { \ + for(index_int i = 0; i < N; i++) \ + d[i] op x[i]; \ + return *this; \ + } \ + MIGRAPHX_DEVICE_CONSTEXPR hip_array& operator op(const T& x) \ + { \ + for(index_int i = 0; i < N; i++) \ + d[i] op x; \ + return *this; \ + } \ + friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator binary_op(hip_array x, const hip_array& y) \ + { \ + return x op y; \ + } \ + friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator binary_op(hip_array x, const T& y) \ + { \ + return x op y; \ + } \ + friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator binary_op(const T& y, hip_array x) \ + { \ + return x op y; \ + } + +template +struct hip_array +{ + T d[N]; + MIGRAPHX_DEVICE_CONSTEXPR T& operator[](index_int i) { return d[i]; } + MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](index_int i) const { return d[i]; } + + MIGRAPHX_DEVICE_CONSTEXPR T& front() { return d[0]; } + MIGRAPHX_DEVICE_CONSTEXPR const T& front() const { return d[0]; } + + MIGRAPHX_DEVICE_CONSTEXPR T& back() { return d[N - 1]; } + MIGRAPHX_DEVICE_CONSTEXPR const T& back() const { return d[N - 1]; } + + MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; } + MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; } + + MIGRAPHX_DEVICE_CONSTEXPR std::integral_constant size() const { return {}; } + + MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; } + MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; } + + MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); } + MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); } + + MIGRAPHX_DEVICE_CONSTEXPR T dot(const hip_array& x) const + { + T result = 0; + for(index_int i = 0; i < N; i++) + result += x[i] * d[i]; + return result; + } + + MIGRAPHX_DEVICE_CONSTEXPR T product() const + { + T result = 1; + for(index_int i = 0; i < N; i++) + result *= d[i]; + return result; + } + + MIGRAPHX_DEVICE_CONSTEXPR T single(index_int width = 100) const + { + T result = 0; + T a = 1; + for(index_int i = 0; i < N; i++) + { + result += d[N - i - 1] * a; + a *= width; + } + return result; + } + + MIGRAPHX_DEVICE_ARRAY_OP(+=, +) + MIGRAPHX_DEVICE_ARRAY_OP(*=, *) + MIGRAPHX_DEVICE_ARRAY_OP(/=, /) + MIGRAPHX_DEVICE_ARRAY_OP(%=, %) + MIGRAPHX_DEVICE_ARRAY_OP(&=, &) + MIGRAPHX_DEVICE_ARRAY_OP(|=, |) + MIGRAPHX_DEVICE_ARRAY_OP(^=, ^) + + friend MIGRAPHX_DEVICE_CONSTEXPR bool operator==(const hip_array& x, const hip_array& y) + { + for(index_int i = 0; i < N; i++) + { + if(x[i] != y[i]) + return false; + } + return true; + } + + friend MIGRAPHX_DEVICE_CONSTEXPR bool operator!=(const hip_array& x, const hip_array& y) + { + return not(x == y); + } + // This uses the product order rather than lexical order + friend MIGRAPHX_DEVICE_CONSTEXPR bool operator<(const hip_array& x, const hip_array& y) + { + for(index_int i = 0; i < N; i++) + { + if(not(x[i] < y[i])) + return false; + } + return true; + } + friend MIGRAPHX_DEVICE_CONSTEXPR bool operator>(const hip_array& x, const hip_array& y) + { + return y < x; + } + friend MIGRAPHX_DEVICE_CONSTEXPR bool operator<=(const hip_array& x, const hip_array& y) + { + return (x < y) or (x == y); + } + friend MIGRAPHX_DEVICE_CONSTEXPR bool operator>=(const hip_array& x, const hip_array& y) + { + return (y < x) or 
(x == y); + } + + MIGRAPHX_DEVICE_CONSTEXPR hip_array carry(hip_array result) const + { + uint32_t overflow = 0; + for(std::ptrdiff_t i = result.size() - 1; i > 0; i--) + { + auto z = result[i] + overflow; + // Reset overflow + overflow = 0; + // Compute overflow using while loop instead of mod + while(z >= d[i]) + { + z -= d[i]; + overflow += 1; + } + result[i] = z; + } + result[0] += overflow; + return result; + } +}; + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/fast_div.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/fast_div.hpp new file mode 100644 index 000000000..70c355135 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/fast_div.hpp @@ -0,0 +1,70 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
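
// A standalone host-side sketch of the index normalization performed by hip_array::carry
// above: overflow is propagated from the innermost dimension outward by repeated
// subtraction rather than a modulo. The rank and dimension lengths below are hypothetical;
// this is an illustration of the behavior, not part of the patch.

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

// Host-side replica of hip_array::carry for a rank-3 index; `lens` plays the role of the
// hip_array contents (the per-dimension lengths) and `idx` is the possibly out-of-range
// index being normalized after a strided increment.
std::array<uint32_t, 3> carry(const std::array<uint32_t, 3>& lens, std::array<uint32_t, 3> idx)
{
    uint32_t overflow = 0;
    for(std::ptrdiff_t i = idx.size() - 1; i > 0; i--)
    {
        auto z = idx[i] + overflow;
        overflow = 0;
        // Subtract instead of % since the overflow per step is usually small
        while(z >= lens[i])
        {
            z -= lens[i];
            overflow += 1;
        }
        idx[i] = z;
    }
    idx[0] += overflow;
    return idx;
}

int main()
{
    // lens {2, 3, 4}: index {0, 2, 5} carries into the outer dimensions, giving {1, 0, 1}
    assert((carry({2, 3, 4}, {0, 2, 5}) == std::array<uint32_t, 3>{1, 0, 1}));
    return 0;
}
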
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_FAST_DIV_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_FAST_DIV_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +constexpr const uint64_t fast_div_shift = 42; +inline uint64_t encode_divisor(uint64_t divisor) +{ + if(divisor == 0) + return 0; + auto p = uint64_t{1} << fast_div_shift; + return (p + divisor - 1) / divisor; +} + +inline constexpr bool is_divisor_encodable(uint64_t i) +{ + return i < (uint64_t{1} << (fast_div_shift / 2)); +} + +MIGRAPHX_DEVICE_CONSTEXPR uint64_t fast_div(uint64_t dividend, uint64_t encoded_divisor) +{ + return (dividend * encoded_divisor) >> fast_div_shift; +} + +MIGRAPHX_DEVICE_CONSTEXPR uint64_t remainder(uint64_t result, uint64_t dividend, uint64_t divisor) +{ + return dividend - divisor * result; +} + +MIGRAPHX_DEVICE_CONSTEXPR uint64_t fast_mod(uint64_t dividend, + uint64_t divisor, + uint64_t encoded_divisor) +{ + return remainder(fast_div(dividend, encoded_divisor), dividend, divisor); +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/float_equal.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/float_equal.hpp new file mode 100644 index 000000000..a5f18fc5a --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/float_equal.hpp @@ -0,0 +1,74 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
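
// A standalone host-side sketch of the encoded-divisor trick defined in fast_div.hpp above:
// a divisor d is encoded once as ceil(2^42 / d), after which a division becomes a multiply
// and a shift. The identity fast_div(n, encode_divisor(d)) == n / d holds whenever both n
// and d are below 2^21, which is the bound is_divisor_encodable() enforces. The test values
// below are hypothetical; this sketch is not part of the patch.

#include <cassert>
#include <cstdint>
#include <initializer_list>

constexpr uint64_t shift = 42;

constexpr uint64_t encode_divisor(uint64_t d)
{
    return d == 0 ? 0 : ((uint64_t{1} << shift) + d - 1) / d;
}
constexpr uint64_t fast_div(uint64_t n, uint64_t e) { return (n * e) >> shift; }

int main()
{
    for(uint64_t d : {1, 3, 7, 1000, (1 << 20)})
    {
        const uint64_t e = encode_divisor(d);
        for(uint64_t n : {0, 1, 5, 999, 123456, (1 << 20)})
            assert(fast_div(n, e) == n / d);
    }
    return 0;
}
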
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP +#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +template +using common_type = typename std::common_type::type; + +template {})> +__device__ bool float_equal_device(T x, T y) +{ + return std::isfinite(x) and std::isfinite(y) and + std::nextafter(x, std::numeric_limits::lowest()) <= y and + std::nextafter(x, std::numeric_limits::max()) >= y; +} + +template <> +__device__ bool float_equal_device(__bf16 x, __bf16 y) // NOLINT(misc-definitions-in-headers) +{ + float xf = x; + float yf = y; + return std::isfinite(xf) and std::isfinite(yf) and + std::nextafter(xf, std::numeric_limits::lowest()) <= yf and + std::nextafter(xf, std::numeric_limits::max()) >= yf; +} + +template {})> +__device__ bool float_equal_device(T x, T y) +{ + return x == y; +} + +template +__device__ bool float_equal(T x, U y) +{ + return float_equal_device>(x, y); +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/launch.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/launch.hpp new file mode 100644 index 000000000..573f57b3b --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/launch.hpp @@ -0,0 +1,146 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_LAUNCH_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_LAUNCH_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +struct index +{ + index_int global = 0; + index_int local = 0; + index_int group = 0; + + __device__ index_int nglobal() const { return blockDim.x * gridDim.x; } // NOLINT + + __device__ index_int nlocal() const { return blockDim.x; } // NOLINT + + template + __device__ void global_stride(index_int n, F f) const + { + const auto stride = nglobal(); + for(index_int i = global; i < n; i += stride) + { + f(i); + } + } + + template + __device__ void local_stride(index_int n, F f) const + { + const auto stride = nlocal(); + for(index_int i = local; i < n; i += stride) + { + f(i); + } + } +}; + +template +__global__ void launcher(F f) +{ + index idx{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT + f(idx); +} + +inline auto launch(hipStream_t stream, index_int global, index_int local) +{ + return [=](auto f) { + assert(local > 0); + assert(global > 0); + using f_type = decltype(f); + dim3 nblocks(global / local); + dim3 nthreads(local); + /* + hipGetLastError() returns error for the first failed HIP call that happened previously. + MIGraphX calls into various backend libraries and failed HIP calls can also happen there. + Calling hipGetLastError() would reset error code to hipSuccess, so that inside MIGraphX + failed call to hipLaunchKernelGGL() can be captured. + */ + hipError_t flush_call = hipGetLastError(); + (void)(flush_call); + // cppcheck-suppress migraphx-UseDeviceLaunch + hipLaunchKernelGGL((launcher), nblocks, nthreads, 0, stream, f); + hipError_t kernel_launch_status = hipGetLastError(); + if(kernel_launch_status != hipSuccess) + { + std::string message = hipGetErrorString(kernel_launch_status); + if(not contains(get_targets(), get_device_name())) + { + message += ". Trying to run a kernel for " + get_device_name() + + " but MIGraphX was built for targets " + get_targets_as_string() + + ". 
Please rebuild MIGraphX with -DGPU_TARGETS='" + get_device_name() + + "'."; + } + MIGRAPHX_THROW("MIGraphX device kernel failed to launch with error: " + message); + } + }; +} + +template +MIGRAPHX_DEVICE_CONSTEXPR auto gs_invoke(F&& f, index_int i, index idx) -> decltype(f(i, idx)) +{ + return f(i, idx); +} + +template +MIGRAPHX_DEVICE_CONSTEXPR auto gs_invoke(F&& f, index_int i, index) -> decltype(f(i)) +{ + return f(i); +} + +inline auto gs_launch(hipStream_t stream, index_int n, index_int local = 1024) +{ + index_int groups = (n + local - 1) / local; + // max possible number of blocks is set to 1B (1,073,741,824) + index_int nglobal = std::min(1073741824, groups) * local; + + return [=](auto f) { + launch(stream, nglobal, local)([=](auto idx) __device__ { + idx.global_stride(n, [&](auto i) { gs_invoke(f, i, idx); }); + }); + }; +} + +#ifdef MIGRAPHX_USE_CLANG_TIDY +#define MIGRAPHX_DEVICE_SHARED +#else +#define MIGRAPHX_DEVICE_SHARED __shared__ +#endif + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/multi_index.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/multi_index.hpp new file mode 100644 index 000000000..6be513a88 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/multi_index.hpp @@ -0,0 +1,164 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
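
// A standalone host-side model of the gs_launch mapping defined in launch.hpp above: the
// grid is rounded up to ceil(n / local) blocks (capped at 2^30), and every thread then walks
// the range with a grid-sized stride, so each element of [0, n) is visited exactly once even
// when the cap kicks in. The problem size below is hypothetical; this sketch is not part of
// the patch.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    const uint64_t n       = 10000;
    const uint64_t local   = 1024;
    const uint64_t groups  = (n + local - 1) / local;
    const uint64_t nglobal = std::min<uint64_t>(1073741824, groups) * local;

    std::vector<int> visits(n, 0);
    for(uint64_t global = 0; global < nglobal; global++) // one iteration per "thread"
        for(uint64_t i = global; i < n; i += nglobal)    // the global_stride loop
            visits[i]++;

    assert(std::all_of(visits.begin(), visits.end(), [](int v) { return v == 1; }));
    return 0;
}
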
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_MULTI_INDEX_HPP +#define MIGRAPHX_GUARD_RTGLIB_MULTI_INDEX_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +template +struct multi_index +{ + using hip_index = hip_array; + hip_index id{}; + hip_index stride{}; + + MIGRAPHX_DEVICE_CONSTEXPR auto for_stride(hip_index n) const + { + // f should return void, but this helps with type deduction + return [=](auto f) -> decltype(f(hip_index{})) { + for(hip_index i = id; i < n; i = n.carry(i + stride)) + { + f(i); + } + }; + } +}; + +template +__device__ __host__ auto deduce_for_stride(ForStride fs) -> decltype(fs(id{})); + +MIGRAPHX_DEVICE_CONSTEXPR multi_index<1> make_multi_index(index_int i, index_int n) +{ + return {{i}, {n}}; +} + +template +MIGRAPHX_DEVICE_CONSTEXPR multi_index +make_multi_index(const hip_shape& s, index_int i, index_int n) +{ + return {s.multi(i), s.multi(n)}; +} + +template +MIGRAPHX_DEVICE_CONSTEXPR multi_index +make_multi_index(const hip_shape& s, index_int i, const hip_array& n) +{ + return {s.multi(i), n}; +} + +template +inline auto mi_nglobal(const hip_shape& s, index_int nlocal) +{ + assert(s.standard); + assert(s.elements() > 0); + index_int n = s.elements(); + index_int groups = (n + nlocal - 1) / nlocal; + // max possible number of blocks is set to 1B (1,073,741,824) + index_int nglobal = std::min(1073741824, groups) * nlocal; + + assert(groups > 0); + assert(nglobal > 0); + auto nglobal_multi = s.multi(nglobal); + + // Skip checking this, since this will cause metadata to not be generated + // for some unknown reason. + // + // assert(std::any_of(nglobal_multi.begin(), nglobal_multi.end(), [](auto x){return x>0;})); + + // cppcheck-suppress migraphx-RedundantLocalVariable + return nglobal_multi; +} + +template +inline auto mi_nlocal(const hip_shape& s, index_int local) +{ + assert(s.standard); + assert(s.elements() > 0); + auto nlocal_multi = s.multi(local); + + // Skip checking this, since this will cause metadata to not be generated + // for some unknown reason. 
+ // + // assert(std::any_of(nlocal_multi.begin(), nlocal_multi.end(), [](auto x){return x>0;})); + + // cppcheck-suppress migraphx-RedundantLocalVariable + return nlocal_multi; +} + +template +inline auto mi_launch(hipStream_t stream, const hip_shape& global, index_int nlocal = 1024) +{ + auto nglobal_multi = mi_nglobal(global, nlocal); + auto nglobal = global.index(nglobal_multi); + + return [=](auto f) { + launch(stream, nglobal, nlocal)([=](auto idx) __device__ { + auto midx = make_multi_index(global, idx.global, nglobal_multi); + f(idx, midx.for_stride(global.lens)); + }); + }; +} + +template +inline auto mi_launch(hipStream_t stream, + const hip_shape& global, + const hip_shape& local, + index_int nlocal = 1024) +{ + auto nglobal_multi = mi_nglobal(global, 1); + auto nglobal = global.index(nglobal_multi); + auto nlocal_multi = mi_nlocal(local, nlocal); + + return [=](auto f) { + launch(stream, nglobal * nlocal, nlocal)([=](auto idx) { + // TODO: Use fast div for nlocal + auto midx = make_multi_index(global, idx.global / nlocal, nglobal_multi); + auto lidx = make_multi_index(local, idx.local, nlocal_multi); + f(idx, midx.for_stride(global.lens), lidx.for_stride(local.lens)); + }); + }; +} + +template +inline auto mi_gs_launch(hipStream_t stream, const hip_shape& global, index_int nlocal = 1024) +{ + return [=](auto f) { + mi_launch(stream, global, nlocal)([=](auto, auto g) { g([&](auto i) { f(i); }); }); + }; +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/nary.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/nary.hpp new file mode 100644 index 000000000..e9af38473 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/nary.hpp @@ -0,0 +1,473 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_NARY_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_NARY_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_NARY); + +// NOLINTNEXTLINE +#define MIGRAPHX_TRACE_NARY_FUNCTION \ + if(enabled(MIGRAPHX_TRACE_NARY{})) \ + std::cout << "nary device function: " << __PRETTY_FUNCTION__ << std::endl; + +template +constexpr auto pack(Ts... xs) +{ + return [=](auto f) { return f(xs...); }; +} + +template +auto nary_nonstandard_nonpacked_impl(hipStream_t stream, F f, argument result, Arguments... args) +{ + MIGRAPHX_TRACE_NARY_FUNCTION + shape s{result.get_shape().type(), result.get_shape().lens()}; + hip_visit_all(s, result, args...)([&](auto standard_shape, auto output, auto... inputs) { + mi_gs_launch(stream, + standard_shape)([=](auto idx) __device__ { output[idx] = f(inputs[idx]...); }); + }); +} + +inline auto create_broadcast_index(index_int len, index_int stride) +{ + auto next_stride = stride * len; + auto e_next_stride = encode_divisor(next_stride); + auto e_stride = encode_divisor(stride); + return [=](auto i) __device__ { + // ( i % next_stride) / stride + return fast_div(i, e_stride) - len * fast_div(i, e_next_stride); + }; +} + +template +auto nary_nonstandard_packed_impl(hipStream_t stream, + F f, + const argument& result, + Arguments... args) +{ + MIGRAPHX_TRACE_NARY_FUNCTION + auto arg_shape = make_array(args...).front().get_shape(); + auto perm = find_permutation(arg_shape); + auto s = reorder_shape(arg_shape, perm); + hip_visit_all(s, result.reshape(reorder_shape(result.get_shape(), perm)), args.reshape(s)...)( + [&](auto standard_shape, auto output, auto... inputs) { + mi_gs_launch(stream, standard_shape)( + [=](auto idx) __device__ { output[idx] = f(inputs[idx]...); }); + }); +} + +template +void nary_broadcast_vec_impl( + hipStream_t stream, F f, argument result, argument barg, Arguments... args) +{ + MIGRAPHX_TRACE_NARY_FUNCTION + const auto& output_shape = result.get_shape(); + const auto& b_shape = barg.get_shape(); + auto bdim = + std::distance(b_shape.strides().begin(), + std::find_if(b_shape.strides().begin(), b_shape.strides().end(), [](auto x) { + return x != 0; + })); + auto bdim_len = output_shape.lens()[bdim]; + auto bdim_stride = output_shape.strides()[bdim]; + auto broadcast_idx = create_broadcast_index(bdim_len, bdim_stride); + + const index_int vec_size = 4; + const index_int nlocal = 1024; + const index_int nglobal = 256 * nlocal; + const index_int bdim_vec_len = bdim_len / vec_size; + hip_vec_visit_all(result, barg, args...)( + [&](auto output, auto binput, auto... 
inputs) { + using type = typename decltype(output)::value_type; + const index_int nelements = output.size() / vec_size; + launch(stream, nglobal, nlocal)([=](auto idx) __device__ { + MIGRAPHX_DEVICE_SHARED type buffer[2048 / vec_size]; + // Load bias into LDS + for(size_t i = idx.local; i < bdim_vec_len; i += nlocal) + { + buffer[i] = binput.data()[i]; + } + __syncthreads(); + const auto* bp = as_pointer(buffer); + // Process the data + for(size_t i = idx.global; i < nelements; i += nglobal) + { + auto bidx = broadcast_idx(i * vec_size); + auto b = bp[bidx]; + auto out = output.data()[i]; + for(index_int j = 0; j < vec_size; j++) + { + out[j] = f(inputs.data()[i][j]..., b); + } + output.data()[i] = out; + } + }); + }); +} + +template +void nary_broadcast_impl(hipStream_t stream, F f, argument result, argument barg, Arguments... args) +{ + MIGRAPHX_TRACE_NARY_FUNCTION + const auto& output_shape = result.get_shape(); + const auto& b_shape = barg.get_shape(); + auto bdim = + std::distance(b_shape.strides().begin(), + std::find_if(b_shape.strides().begin(), b_shape.strides().end(), [](auto x) { + return x != 0; + })); + auto bdim_len = output_shape.lens()[bdim]; + auto bdim_stride = output_shape.strides()[bdim]; + auto broadcast_idx = create_broadcast_index(bdim_len, bdim_stride); + + const index_int nlocal = 1024; + const index_int nglobal = 256 * nlocal; + index_int nelements = result.get_shape().elements(); + hip_visit_all(result, barg, args...)([&](auto output, auto binput, auto... inputs) { + using type = typename decltype(output)::value_type; + launch(stream, nglobal, nlocal)([=](auto idx) __device__ { + MIGRAPHX_DEVICE_SHARED type buffer[2048]; + // Load bias into LDS + for(size_t i = idx.local; i < bdim_len; i += nlocal) + { + buffer[i] = binput.data()[i]; + } + __syncthreads(); + // Process the data + for(size_t i = idx.global; i < nelements; i += nglobal) + { + auto bidx = broadcast_idx(i); + auto b = buffer[bidx]; + output.data()[i] = f(inputs.data()[i]..., b); + } + }); + }); +} + +template +void nary_double_broadcast_vec_impl( + hipStream_t stream, F f, argument result, argument barg1, argument barg2, Arguments... args) +{ + MIGRAPHX_TRACE_NARY_FUNCTION + assert(barg1.get_shape().broadcasted()); + assert(barg2.get_shape().broadcasted()); + assert(barg1.get_shape() == barg2.get_shape()); + const auto& output_shape = result.get_shape(); + const auto& b_shape = barg1.get_shape(); + auto bdim = + std::distance(b_shape.strides().begin(), + std::find_if(b_shape.strides().begin(), b_shape.strides().end(), [](auto x) { + return x != 0; + })); + auto bdim_len = output_shape.lens()[bdim]; + auto bdim_stride = output_shape.strides()[bdim]; + auto broadcast_idx = create_broadcast_index(bdim_len, bdim_stride); + + const index_int vec_size = 4; + const index_int nlocal = 1024; + const index_int nglobal = 256 * nlocal; + const index_int bdim_vec_len = bdim_len / vec_size; + hip_vec_visit_all(result, barg1, barg2, args...)( + [&](auto output, auto binput1, auto binput2, auto... 
inputs) { + using type = typename decltype(output)::value_type; + const index_int nelements = output.size() / vec_size; + launch(stream, nglobal, nlocal)([=](auto idx) __device__ { + MIGRAPHX_DEVICE_SHARED type buffer[2048 / vec_size]; + // Load bias into LDS + for(size_t i = idx.local; i < bdim_vec_len; i += nlocal) + { + buffer[i] = binput1.data()[i]; + } + for(size_t i = idx.local; i < bdim_vec_len; i += nlocal) + { + buffer[i + bdim_vec_len] = binput2.data()[i]; + } + __syncthreads(); + const auto* bp = as_pointer(buffer); + // Process the data + for(size_t i = idx.global; i < nelements; i += nglobal) + { + auto bidx = broadcast_idx(i * vec_size); + auto b1 = bp[bidx]; + auto b2 = bp[bidx + bdim_len]; + auto out = output.data()[i]; + for(index_int j = 0; j < vec_size; j++) + { + out[j] = f(inputs.data()[i][j]..., b2, b1); + } + output.data()[i] = out; + } + }); + }); +} + +template +void nary_double_broadcast_impl( + hipStream_t stream, F f, argument result, argument barg1, argument barg2, Arguments... args) +{ + MIGRAPHX_TRACE_NARY_FUNCTION + assert(barg1.get_shape().broadcasted()); + assert(barg2.get_shape().broadcasted()); + assert(barg1.get_shape() == barg2.get_shape()); + const auto& output_shape = result.get_shape(); + const auto& b_shape = barg1.get_shape(); + auto bdim = + std::distance(b_shape.strides().begin(), + std::find_if(b_shape.strides().begin(), b_shape.strides().end(), [](auto x) { + return x != 0; + })); + auto bdim_len = output_shape.lens()[bdim]; + auto bdim_stride = output_shape.strides()[bdim]; + auto broadcast_idx = create_broadcast_index(bdim_len, bdim_stride); + + const index_int nlocal = 1024; + const index_int nglobal = 256 * nlocal; + index_int nelements = result.get_shape().elements(); + hip_visit_all(result, barg1, barg2, args...)( + [&](auto output, auto binput1, auto binput2, auto... inputs) { + using type = typename decltype(output)::value_type; + launch(stream, nglobal, nlocal)([=](auto idx) __device__ { + MIGRAPHX_DEVICE_SHARED type buffer[2048]; + // Load bias into LDS + for(size_t i = idx.local; i < bdim_len; i += nlocal) + { + buffer[i] = binput1.data()[i]; + } + for(size_t i = idx.local; i < bdim_len; i += nlocal) + { + buffer[i + bdim_len] = binput2.data()[i]; + } + __syncthreads(); + // Process the data + for(size_t i = idx.global; i < nelements; i += nglobal) + { + auto bidx = broadcast_idx(i); + auto b1 = buffer[bidx]; + auto b2 = buffer[bidx + bdim_len]; + output.data()[i] = f(inputs.data()[i]..., b2, b1); + } + }); + }); +} + +template +void nary_standard_vec_impl(hipStream_t stream, F f, argument result, Arguments... args) +{ + MIGRAPHX_TRACE_NARY_FUNCTION + const auto& output_shape = result.get_shape(); + visit_all(result, args...)([&](auto output, auto... inputs) { + using type = device_type>; + const index_int vec_size = 4; + auto data = pack_vec<4>(device_cast(inputs.data())...); + auto* outp = as_vec<4>(device_cast(output.data())); + gs_launch(stream, output_shape.elements() / vec_size)([=](auto i) __device__ { + vec out = outp[i]; + data( + [&](auto... xs) { + for(index_int j = 0; j < vec_size; j++) + { + out[j] = f(xs[j]...); + } + }, + i); + outp[i] = out; + }); + }); +} + +template +void nary_standard_impl(hipStream_t stream, F f, argument result, Arguments... args) +{ + MIGRAPHX_TRACE_NARY_FUNCTION + index_int nelements = result.get_shape().elements(); + hip_pointer_visit_all(result, args...)([&](auto output, auto... 
inputs) { + gs_launch(stream, nelements)([=](auto i) __device__ { output[i] = f(inputs[i]...); }); + }); +} + +template +void nary_impl(hipStream_t stream, F f, argument result, Arguments... args) +{ + MIGRAPHX_TRACE_NARY_FUNCTION + const auto shapes = make_array(args.get_shape()...); + const bool standard = all_of(shapes, [](const shape& s) { return s.standard(); }); + const bool packed = + all_of(shapes, [](const shape& s) { return s.packed() and not s.broadcasted(); }); + const bool same_shapes = + all_of(shapes, [&](const shape& s) { return s == result.get_shape(); }); + const bool same_input_shapes = all_of(shapes, [&](const shape& s) { return s == shapes[0]; }); + if((result.get_shape().standard() and standard) or (packed and same_shapes)) + nary_standard_impl(stream, f, result, args...); + else if(packed and same_input_shapes) + nary_nonstandard_packed_impl(stream, f, result, args...); + else + nary_nonstandard_nonpacked_impl(stream, f, result, args...); +} + +template +auto nary_nonstandard(hipStream_t stream, argument result, Arguments... args) +{ + return [=](auto f) { nary_nonstandard_nonpacked_impl(stream, f, result, args...); }; +} + +template +auto nary_standard(hipStream_t stream, argument result, Arguments... args) +{ + return [=](auto f) { nary_standard_impl(stream, f, result, args...); }; +} + +template +bool broadcastable(bool& divisible_by_4, + index_int max_size, + const argument& result, + const argument& barg, + const Arguments&... args) +{ + divisible_by_4 = false; + auto bshape = barg.get_shape(); + const bool standard = + all_of({args.get_shape()...}, [](const shape& s) { return s.standard(); }); + const bool same_shapes = + all_of({args.get_shape()...}, [&](const shape& s) { return s == result.get_shape(); }); + // TODO: Check result and args shape is the same + if(standard and same_shapes and bshape.broadcasted() and not bshape.scalar()) + { + auto not_zero = [](auto x) { return x != 0; }; + const auto& strides = bshape.strides(); + auto b_it = std::find_if(strides.begin(), strides.end(), not_zero); + auto b_idx = std::distance(strides.begin(), b_it); + auto b_len = result.get_shape().lens()[b_idx]; + auto b_stride = result.get_shape().strides()[b_idx]; + assert(bshape.lens()[b_idx] == b_len); + if(b_len <= max_size and std::none_of(std::next(b_it), strides.end(), not_zero) and + is_divisor_encodable(b_stride * b_len)) + { + + divisible_by_4 = (b_len % 4 == 0) and (b_stride % 4 == 0) and + (front_args(args...).get_shape().elements() % 4 == 0); + return true; + } + } + return false; +} + +inline bool broadcastable(bool& divisible_by_4, index_int, const argument&, const argument&) +{ + divisible_by_4 = false; + return false; +} + +// Nullary +inline auto nary(hipStream_t stream, argument result) +{ + return [=](auto f) { nary_standard_impl(stream, f, result); }; +} + +// Unary +inline auto nary(hipStream_t stream, argument result, argument arg) +{ + return [=](auto f) { nary_impl(stream, f, result, arg); }; +} + +// Binary +inline auto nary(hipStream_t stream, argument result, argument arg, argument barg) +{ + return [=](auto f) { + bool divisible_by_4 = false; + if(broadcastable(divisible_by_4, 2048, result, barg, arg)) + { + if(divisible_by_4) + nary_broadcast_vec_impl(stream, f, result, barg, arg); + else + nary_broadcast_impl(stream, f, result, barg, arg); + } + else + { + nary_impl(stream, f, result, arg, barg); + } + }; +} + +template +auto nary(hipStream_t stream, argument result, Arguments... 
args) +{ + static_assert(sizeof...(args) > 2, "Args needs to be greater than 2"); + return [=](auto f) { + auto barg1 = back_args(args...); + bool fallback1 = pop_back_args(args...)([&](auto&&... args2) { + auto barg2 = back_args(args2...); + bool fallback2 = + barg2.get_shape() != barg1.get_shape() or not barg2.get_shape().broadcasted() or + pop_back_args(args2...)([&](auto&&... args3) { + bool divisible_by_4 = false; + if(broadcastable(divisible_by_4, 1024, result, barg2, args3...)) + { + if(divisible_by_4) + nary_double_broadcast_vec_impl( + stream, f, result, barg1, barg2, args3...); + else + nary_double_broadcast_impl(stream, f, result, barg1, barg2, args3...); + return false; + } + return true; + }); + if(not fallback2) + return false; + bool divisible_by_4 = false; + if(broadcastable(divisible_by_4, 2048, result, barg1, args2...)) + { + if(divisible_by_4) + nary_broadcast_vec_impl(stream, f, result, barg1, args2...); + else + nary_broadcast_impl(stream, f, result, barg1, args2...); + return false; + } + return true; + }); + if(fallback1) + nary_impl(stream, f, result, args...); + }; +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp new file mode 100644 index 000000000..ae796c66e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp @@ -0,0 +1,311 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
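
// A standalone host-side check of the identity behind create_broadcast_index in nary.hpp
// above: for a broadcast dimension of length len and stride stride, the bias element that
// feeds output index i is (i % (len * stride)) / stride, and the kernel computes the same
// value with two encoded-divisor divisions. The sizes below are hypothetical and chosen to
// stay within the encodable range; this sketch is not part of the patch.

#include <cassert>
#include <cstdint>

constexpr uint64_t shift = 42;
constexpr uint64_t encode(uint64_t d) { return ((uint64_t{1} << shift) + d - 1) / d; }
constexpr uint64_t fdiv(uint64_t n, uint64_t e) { return (n * e) >> shift; }

int main()
{
    const uint64_t len = 8, stride = 64;
    const uint64_t e_stride = encode(stride);
    const uint64_t e_next   = encode(stride * len);
    for(uint64_t i = 0; i < 4096; i++)
    {
        const uint64_t reference = (i % (stride * len)) / stride;
        const uint64_t fast      = fdiv(i, e_stride) - len * fdiv(i, e_next);
        assert(fast == reference);
    }
    return 0;
}
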
+ */ + +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +#ifdef MIGRAPHX_NO_DPP + +template {})> +__device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f) +{ + using type = decltype(f(deduce_for_stride(fs))); + MIGRAPHX_DEVICE_SHARED type buffer[N]; + type x = init; + fs([&](auto i) { x = op(x, f(i)); }); + buffer[idx.local] = x; + __syncthreads(); + + for(index_int s = 1; s < idx.nlocal(); s *= 2) + { + const index_int index = 2 * s * idx.local; + if(index + s < idx.nlocal()) + { + buffer[index] = op(buffer[index], buffer[index + s]); + } + __syncthreads(); + } + return buffer[0]; +} + +#else +constexpr unsigned int dpp_row_shr(unsigned int x) { return 0x110u | x; } + +constexpr unsigned int dpp_row_bcast(unsigned int x) +{ + unsigned int y = 0; + switch(x) + { + case 15: y = 0x142; break; + case 31: y = 0x143; break; + default: throw std::runtime_error("Unknown bcast"); + } + return y; +} + +template +__device__ T dpp_mov(T& x) +{ + static const index_int n = sizeof(T) < 4 ? 1 : sizeof(T) / 4; + union type + { + uint32_t reg[n]; + T data; + }; + type output{}; + type input{}; + // cppcheck-suppress unreadVariable + input.data = x; + for(index_int i = 0; i < n; i++) + { + output.reg[i] = __hip_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl); + } + return output.data; +} + +template +__device__ void dpp_reduce(T& in, Op op) +{ + T out{}; + out = dpp_mov(in); + in = op(in, out); + out = dpp_mov(in); + in = op(in, out); + out = dpp_mov(in); + in = op(in, out); + out = dpp_mov(in); + in = op(in, out); +#if __AMDGCN_WAVEFRONT_SIZE == 64 + out = dpp_mov(in); + in = op(in, out); + out = dpp_mov(in); + in = op(in, out); +#endif +} + +__device__ inline void dpp_reduce(float& x, sum) +{ +#if defined(MIGRAPHX_USE_CLANG_TIDY) || defined(CPPCHECK) + x = 1; +#else + __asm__ volatile("s_nop 4\n" + "v_add_f32 %0 %0 %0 row_shr:1\n" + "s_nop 1\n" + "v_add_f32 %0 %0 %0 row_shr:2\n" + "s_nop 1\n" + "v_add_f32 %0 %0 %0 row_shr:4 bank_mask:0xe\n" + "s_nop 1\n" + "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n" + "s_nop 1\n" +#if __AMDGCN_WAVEFRONT_SIZE == 64 + "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n" + "s_nop 1\n" + "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n" +#endif + "s_nop 1\n" + : "=v"(x) + : "0"(x)); +#endif +} + +template {})> +__device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f) +{ + +#if __AMDGCN_WAVEFRONT_SIZE == 32 + constexpr index_int nthreads = 16; +#else + constexpr index_int nthreads = 64; +#endif + using type = decltype(f(deduce_for_stride(fs))); + MIGRAPHX_DEVICE_SHARED type buffer[N / nthreads]; + type x = init; + fs([&](auto i) { x = op(x, f(i)); }); + dpp_reduce(x, op); + + const auto ldsidx = idx.local / nthreads; + if((idx.local % nthreads) == nthreads - 1) + { + buffer[ldsidx] = x; + } + __syncthreads(); + + type y = init; + for(index_int i = 0; i < idx.nlocal() / nthreads; i++) + { + y = op(y, buffer[i]); + } + return y; +} +#endif +template +__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f) +{ + auto midx = make_multi_index(idx.local, idx.nlocal()); + // Workaround hcc, create a local array + auto fs = midx.id; + fs[0] = n; + return block_reduce( + idx, op, init, midx.for_stride(fs), [&](auto mi) __device__ { return f(mi[0]); }); +} +constexpr index_int compute_block_size(index_int n, index_int max_block_size) 
+{ + size_t block_size = 64; + while(block_size < max_block_size and block_size < n) + block_size *= 2; + return block_size; +} + +inline std::vector get_reduce_lens(const std::vector& input_lens, + const std::vector& output_lens) +{ + std::vector reduce_lens; + std::transform(output_lens.begin(), + output_lens.end(), + input_lens.begin(), + std::back_inserter(reduce_lens), + [](auto x, auto y) -> index_int { + if(x == y) + return 1; + else + return y; + }); + return reduce_lens; +} + +template +void reduce_multi_impl(hipStream_t stream, + const argument& result, + const argument& arg, + Op op, + T init, + Input read_input, + Output read_output, + const shape& reduce_slice) +{ + hip_visit_all(result, arg, reduce_slice)([&](auto output, auto input, auto reduce_shape) { + auto relements = reduce_slice.elements(); + + const index_int max_block_size = 256; + const index_int block_size = compute_block_size(relements, max_block_size); + mi_launch(stream, output.get_shape(), reduce_shape, block_size)( + [=](auto idx, auto global, auto local) __device__ { + global([&](auto i) __device__ { + auto r = + block_reduce(idx, op, init, local, [&](auto j) __device__ { + return read_input(input[i + j]); + }); + if(idx.local == 0) + output[i] = read_output(r); + }); + }); + }); +} + +template +void reduce_standard_impl(hipStream_t stream, + const argument& result, + const argument& arg, + Op op, + T init, + Input read_input, + Output read_output, + index_int relements) +{ + hip_visit_all(result, arg)([&](auto output, auto input) { + auto nelements = result.get_shape().elements(); + + const index_int max_block_size = 256; + const index_int block_size = compute_block_size(relements, max_block_size); + gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ { + const auto out_idx = i / block_size; + const auto base_idx = out_idx * relements; + auto r = block_reduce(idx, op, init, relements, [&](auto j) __device__ { + return read_input(input.data()[base_idx + j]); + }); + if(idx.local == 0) + output.data()[out_idx] = read_output(r); + }); + }); +} + +template +void reduce(hipStream_t stream, + const argument& result, + const argument& arg, + Op op, + T init, + Input read_input, + Output read_output) +{ + auto&& output_shape = result.get_shape(); + auto&& input_shape = arg.get_shape(); + auto input_lens = input_shape.lens(); + auto output_lens = output_shape.lens(); + assert(output_lens.size() == input_lens.size()); + if(input_shape.standard() and output_shape.standard() and + output_lens.back() != input_lens.back() and + std::equal(output_lens.begin(), std::prev(output_lens.end()), input_lens.begin())) + { + reduce_standard_impl( + stream, result, arg, op, init, read_input, read_output, input_lens.back()); + } + else + { + std::vector reduce_lens = get_reduce_lens(input_lens, output_lens); + shape reduce_slice{output_shape.type(), reduce_lens}; + reduce_multi_impl(stream, result, arg, op, init, read_input, read_output, reduce_slice); + } +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif // MIGRAPHX_NO_DPP diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/reduce_ops.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/reduce_ops.hpp new file mode 100644 index 000000000..6bafb0d08 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/reduce_ops.hpp @@ -0,0 +1,111 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 
Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_DEVICE_REDUCE_OPS_HPP +#define MIGRAPHX_GUARD_DEVICE_REDUCE_OPS_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +struct sum +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const + { + return x + y; + } +}; + +struct product +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const + { + return x * y; + } +}; + +struct id +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const + { + return x; + } +}; + +struct mean +{ + size_t item_num = 1; + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const + { + return x / static_cast(item_num); + } +}; + +struct max +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const + { + return (x > y) ? x : y; + } +}; + +struct min +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const + { + return (x < y) ? x : y; + } +}; + +struct lowest +{ + template + __device__ __host__ operator T() const + { + return device_cast(std::numeric_limits>::lowest()); + } +}; + +struct highest +{ + template + __device__ __host__ operator T() const + { + return device_cast(std::numeric_limits>::max()); + } +}; + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_DEVICE_REDUCE_OPS_HPP diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/scan.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/scan.hpp new file mode 100644 index 000000000..5a66f7f73 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/scan.hpp @@ -0,0 +1,97 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_DEVICE_SCAN_HPP +#define MIGRAPHX_GUARD_DEVICE_SCAN_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +template {})> +__device__ void block_scan(index idx, Op op, T init, ForStride fs, Input input, Output output) +{ + using type = decltype(input(deduce_for_stride(fs))); + MIGRAPHX_DEVICE_SHARED type buffer[2][N]; + type x = init; + fs([&](auto i) { + index_int iout = 0; + index_int iin = 1; + if(idx.local == 0) + buffer[iout][idx.local] = op(input(i), x); + else + buffer[iout][idx.local] = input(i); + __syncthreads(); + for(index_int s = 1; s < idx.nlocal(); s *= 2) + { + iout = 1 - iout; + iin = 1 - iin; + if(idx.local >= s) + { + buffer[iout][idx.local] = op(buffer[iin][idx.local], buffer[iin][idx.local - s]); + } + else + { + buffer[iout][idx.local] = buffer[iin][idx.local]; + } + __syncthreads(); + } + x = buffer[iout][idx.nlocal() - 1]; + output(i, buffer[iout][idx.local]); + }); +} + +template +__device__ void block_scan(index idx, Op op, T init, index_int n, Input input, Output output) +{ + block_scan( + idx, + op, + init, + [&](auto f) -> decltype(f(index_int{})) { return idx.local_stride(n, f); }, + input, + output); +} + +template +constexpr auto reverse_scan(index_int n, F f) +{ + return [=](auto i, auto&&... xs) { return f(n - i - 1, xs...); }; +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_DEVICE_SCAN_HPP diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/shape.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/shape.hpp new file mode 100644 index 000000000..66f065c78 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/shape.hpp @@ -0,0 +1,120 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
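
// A standalone host-side model of the block_scan kernel in scan.hpp above: a double-buffered
// Hillis-Steele inclusive scan over one tile of lanes, with the running carry folded into
// lane 0 before the sweep, as the device code folds x into the first element of each stride
// iteration. The input below is hypothetical; this sketch is not part of the patch.

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

std::vector<uint64_t> tile_scan(const std::vector<uint64_t>& in, uint64_t carry)
{
    const size_t n = in.size();
    std::vector<std::vector<uint64_t>> buffer(2, std::vector<uint64_t>(n));
    for(size_t local = 0; local < n; local++)
        buffer[0][local] = (local == 0) ? in[local] + carry : in[local];

    size_t iout = 0;
    for(size_t s = 1; s < n; s *= 2)
    {
        const size_t iin = iout;
        iout             = 1 - iout;
        for(size_t local = 0; local < n; local++)
            buffer[iout][local] = (local >= s)
                                      ? buffer[iin][local] + buffer[iin][local - s]
                                      : buffer[iin][local];
    }
    return buffer[iout]; // lane nlocal - 1 holds the carry for the next tile
}

int main()
{
    const std::vector<uint64_t> in = {3, 1, 4, 1, 5, 9, 2, 6};
    const auto out                 = tile_scan(in, 0);

    std::vector<uint64_t> expected(in.size());
    std::partial_sum(in.begin(), in.end(), expected.begin()); // inclusive prefix sum
    assert(out == expected);
    return 0;
}
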
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +template +struct hip_shape +{ + using hip_index = hip_array; + hip_index lens = {}; + hip_index strides = {}; + hip_array divs = {}; + bool standard = false; + + __device__ __host__ hip_shape() = default; + + hip_shape(const shape& s) : standard(s.standard()) + { + assert(s.lens().size() == N); + assert(s.strides().size() == N); + std::copy(s.lens().begin(), s.lens().end(), lens.begin()); + std::copy(s.strides().begin(), s.strides().end(), strides.begin()); + assert(std::all_of(s.lens().begin(), s.lens().end(), &is_divisor_encodable)); + std::transform(s.lens().begin(), s.lens().end(), divs.begin(), &encode_divisor); + } + + MIGRAPHX_DEVICE_CONSTEXPR index_int elements() const { return lens.product(); } + + MIGRAPHX_DEVICE_CONSTEXPR index_int index(hip_index x) const { return x.dot(strides); } + + MIGRAPHX_DEVICE_CONSTEXPR index_int index(std::initializer_list x) const + { + index_int idx = 0; + for(index_int i = 0; i < x.size(); i++) + idx += *(x.begin() + i) * strides[i]; + return idx; + } + + MIGRAPHX_DEVICE_CONSTEXPR index_int index(index_int i) const + { + if(this->standard) + return i; + else + { + const index_int rank = this->lens.size(); + index_int s = 1; + index_int result = 0; + for(index_int j = 0; j < this->lens.size(); j++) + { + const index_int k = rank - j - 1; + const index_int stride = this->strides[k]; + const index_int len = this->lens[k]; + const index_int slen = s * len; + const index_int idx = (i % slen) / s; + result += stride * idx; + s = slen; + } + return result; + } + } + + MIGRAPHX_DEVICE_CONSTEXPR hip_index multi(index_int idx) const + { + hip_index result; + index_int tidx = idx; + for(std::ptrdiff_t is = result.size() - 1; is > 0; is--) + { + // result[is] = tidx % lens[is]; + // tidx = tidx / lens[is]; + auto q = fast_div(tidx, divs[is]); + result[is] = remainder(q, tidx, lens[is]); + tidx = q; + } + result[0] = tidx; + return result; + } +}; + +template +hip_shape make_hip_shape(const shape& x) +{ + return x; +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp 
b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp new file mode 100644 index 000000000..2b85cb89d --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp @@ -0,0 +1,76 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +template +using hip_tensor_index = hip_array; + +template +struct hip_tensor_descriptor +{ + __device__ __host__ hip_tensor_descriptor() = default; + + hip_tensor_descriptor(const shape& s) + { + std::copy(s.lens().begin(), s.lens().end(), lens); + std::copy(s.strides().begin(), s.strides().end(), strides); + } + + __device__ __host__ hip_tensor_index multi(index_int idx) const + { + hip_tensor_index result{}; + index_int tidx = idx; + for(index_int is = 0; is < NDim; is++) + { + result[is] = tidx / strides[is]; + tidx = tidx % strides[is]; + } + + return result; + } + __device__ __host__ index_int linear(hip_tensor_index s) const + { + index_int idx = 0; + for(index_int i = 0; i < NDim; i++) + idx += s[i] * strides[i]; + return idx; + } + index_int lens[NDim] = {}; + index_int strides[NDim] = {}; +}; + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/tensor_view.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/tensor_view.hpp new file mode 100644 index 000000000..8be3908a4 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/tensor_view.hpp @@ -0,0 +1,82 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
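The index arithmetic in hip_shape::index/multi and hip_tensor_descriptor above is the usual strided row-major conversion between a linear offset and a multi-index. A host-side sketch of that math follows; the names are illustrative, and the fast_div/encode_divisor optimization is deliberately omitted.

#include <array>
#include <cstdint>
#include <iostream>

// Linear index -> multi-index, walking from the fastest-moving dimension.
template <std::size_t N>
std::array<std::uint32_t, N> multi_index(std::uint32_t i, const std::array<std::uint32_t, N>& lens)
{
    std::array<std::uint32_t, N> result{};
    for(std::size_t k = N; k > 0; --k)
    {
        result[k - 1] = i % lens[k - 1];
        i /= lens[k - 1];
    }
    return result;
}

// Multi-index -> linear index, the dot product with the strides.
template <std::size_t N>
std::uint32_t linear_index(const std::array<std::uint32_t, N>& idx,
                           const std::array<std::uint32_t, N>& strides)
{
    std::uint32_t r = 0;
    for(std::size_t k = 0; k < N; ++k)
        r += idx[k] * strides[k];
    return r;
}

int main()
{
    std::array<std::uint32_t, 3> lens{2, 3, 4};
    std::array<std::uint32_t, 3> strides{12, 4, 1};   // packed (standard) strides for lens
    auto m = multi_index<3>(17, lens);                // 17 -> {1, 1, 1}
    std::cout << linear_index<3>(m, strides) << '\n'; // prints 17 again
}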
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +template +struct hip_tensor_view +{ + using value_type = T; + using hip_index = typename hip_shape::hip_index; + __device__ __host__ hip_tensor_view() = default; + __host__ hip_tensor_view(tensor_view x) : d(x.data()), s(x.get_shape()) {} + __host__ hip_tensor_view(T* x, const shape& ss) : d(x), s(ss) {} + + MIGRAPHX_DEVICE_CONSTEXPR const hip_shape& get_shape() const { return s; } + + MIGRAPHX_DEVICE_CONSTEXPR index_int size() const { return s.elements(); } + + MIGRAPHX_DEVICE_CONSTEXPR value_type* data() const { return d; } + + template + MIGRAPHX_DEVICE_CONSTEXPR value_type& operator[](U i) const + { + return d[s.index(i)]; + } + + MIGRAPHX_DEVICE_CONSTEXPR value_type* begin() const { return d; } + + MIGRAPHX_DEVICE_CONSTEXPR value_type* end() const { return d + size(); } + + private: + value_type* d = nullptr; + hip_shape s{}; +}; + +template +hip_tensor_view make_hip_view(const shape& s, T* x) +{ + return {x, s}; +} + +template +hip_tensor_view make_hip_view(tensor_view x) +{ + return {x}; +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/types.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/types.hpp new file mode 100644 index 000000000..c9f2e3d7c --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/types.hpp @@ -0,0 +1,213 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP +#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +using index_int = std::uint32_t; + +#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT + +template +using vec = T __attribute__((ext_vector_type(N))); + +template +__device__ __host__ T* as_pointer(vec* x) +{ + return reinterpret_cast(x); +} + +template +__device__ __host__ vec* as_vec(T* x) +{ + return reinterpret_cast*>(x); +} + +template +tensor_view> as_vec(tensor_view x) +{ + return {x.get_shape(), as_vec(x.data())}; +} + +template +auto pack_vec(Ts... xs) +{ + return [=](auto f, index_int n) { return f(as_vec(xs)[n]...); }; +} + +using gpu_half = __fp16; +using gpu_bf16 = __bf16; + +namespace detail { +template +struct device_type +{ + using type = T; +}; + +template +struct device_type> +{ + using type = vec::type, N>; +}; + +template <> +struct device_type +{ + using type = gpu_half; +}; + +template <> +struct device_type +{ + using type = gpu_bf16; +}; + +template +struct host_type +{ + using type = T; +}; + +template <> +struct host_type +{ + using type = half; +}; + +template <> +struct host_type +{ + using type = bf16; +}; + +} // namespace detail + +template +using host_type = typename detail::host_type::type; + +template +using device_type = typename detail::device_type::type; + +template +host_type host_cast(T x) +{ + return reinterpret_cast&>(x); +} + +template +host_type* host_cast(T* x) +{ + return reinterpret_cast*>(x); +} + +template +__device__ __host__ device_type device_cast(const T& x) +{ + return reinterpret_cast&>(x); +} + +template +__device__ __host__ device_type* device_cast(T* x) +{ + return reinterpret_cast*>(x); +} + +template +__device__ __host__ tensor_view> device_cast(tensor_view x) +{ + return {x.get_shape(), reinterpret_cast*>(x.data())}; +} + +template +__device__ __host__ T to_hip_type(T x) +{ + return x; +} + +// Hip doens't support __fp16 and __bf16 +inline __device__ __host__ float to_hip_type(gpu_half x) { return x; } +inline __device__ __host__ float to_hip_type(gpu_bf16 x) { return x; } + +template +struct is_floating_point : std::is_floating_point +{ +}; + +template <> +struct is_floating_point<__fp16> : std::true_type +{ +}; + +template +struct is_signed : std::is_signed +{ +}; + +template <> +struct is_signed<__fp16> : std::true_type +{ +}; + +template +struct is_arithmetic : std::is_arithmetic +{ +}; + +template <> +struct is_arithmetic<__fp16> : std::true_type +{ +}; + +// Redo for __bf16 +template <> +struct is_floating_point<__bf16> : std::true_type +{ +}; +template <> +struct is_signed<__bf16> : std::true_type +{ +}; +template <> +struct is_arithmetic<__bf16> : std::true_type +{ +}; + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git 
a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/vector.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/vector.hpp new file mode 100644 index 000000000..93fe06b0c --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/vector.hpp @@ -0,0 +1,99 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +template +struct hip_vector +{ + MIGRAPHX_DEVICE_CONSTEXPR hip_vector() = default; + MIGRAPHX_DEVICE_CONSTEXPR hip_vector(index_int s) : len(s) {} + template + __device__ __host__ hip_vector(Iterator start, Iterator last) + { + auto it = std::copy(start, last, d); + len = std::distance(d, it); + } + + __device__ __host__ hip_vector(std::initializer_list x) + { + std::copy(x.begin(), x.end(), d); + len = x.size(); + } + + MIGRAPHX_DEVICE_CONSTEXPR T& operator[](index_int i) { return d[i]; } + MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](index_int i) const { return d[i]; } + + MIGRAPHX_DEVICE_CONSTEXPR T& front() { return d[0]; } + MIGRAPHX_DEVICE_CONSTEXPR const T& front() const { return d[0]; } + + MIGRAPHX_DEVICE_CONSTEXPR T& back() { return d[size() - 1]; } + MIGRAPHX_DEVICE_CONSTEXPR const T& back() const { return d[size() - 1]; } + + MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; } + MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; } + + MIGRAPHX_DEVICE_CONSTEXPR index_int size() const { return len; } + + MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; } + MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; } + + MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); } + MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); } + + template + MIGRAPHX_DEVICE_CONSTEXPR void push_back(U&& x) + { + d[len] = static_cast(x); + len++; + } + + private: + T d[N] = {}; + index_int len = 0; +}; + +template +hip_vector to_hip_vector(const std::vector& x) +{ + hip_vector result(x.size()); + std::copy(x.begin(), x.end(), result.begin()); + return result; +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git 
a/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/visit.hpp b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/visit.hpp new file mode 100644 index 000000000..78f28a552 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/include/migraphx/gpu/device/visit.hpp @@ -0,0 +1,245 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +template +constexpr void visit_tensor_size(index_int n, F f) +{ + switch(n) + { + case 1: { + f(std::integral_constant{}); + break; + } + case 2: { + f(std::integral_constant{}); + break; + } + case 3: { + f(std::integral_constant{}); + break; + } + case 4: { + f(std::integral_constant{}); + break; + } + case 5: { + f(std::integral_constant{}); + break; + } + default: throw std::runtime_error("Tensor dims " + std::to_string(n) + " out of range"); + } +} + +inline shape get_shape(const shape& x) { return x; } + +template +auto get_shape(const T& x) -> decltype(x.get_shape()) +{ + return x.get_shape(); +} + +template +struct is_hip_type : std::false_type +{ +}; + +template <> +struct is_hip_type : std::true_type +{ +}; +template <> +struct is_hip_type : std::true_type +{ +}; +template <> +struct is_hip_type : std::true_type +{ +}; +template <> +struct is_hip_type : std::true_type +{ +}; +template <> +struct is_hip_type : std::true_type +{ +}; +template <> +struct is_hip_type : std::true_type +{ +}; +template <> +struct is_hip_type : std::true_type +{ +}; + +template {})> +void hip_visitor_invoke(T as, V&& v) +{ + v(as); +} + +template {})> +void hip_visitor_invoke(T, V&&) +{ + MIGRAPHX_THROW(std::string("Unsupported data type on GPU: ") + __PRETTY_FUNCTION__); +} + +template +auto hip_visitor(V v) +{ + return [=](auto as) { hip_visitor_invoke(as, v); }; +} + +template +void hip_visit_all_impl(const shape& s, F f, V&& v, Ts&&... 
xs) +{ + std::initializer_list types = {get_shape(xs).type()...}; + if(not std::all_of( + types.begin(), types.end(), [&](migraphx::shape::type_t t) { return t == s.type(); })) + MIGRAPHX_THROW("Types must be the same"); + std::initializer_list ranks = {static_cast(get_shape(xs).ndim())...}; + if(not std::all_of(ranks.begin(), ranks.end(), [&](index_int r) { return r == s.ndim(); })) + MIGRAPHX_THROW("Ranks must be the same"); + visit_tensor_size(s.ndim(), [&](auto ndim) { + s.visit_type(hip_visitor([&](auto as) { v(f(xs, ndim, as)...); })); + }); +} + +template +void hip_visit_views_impl(const shape& s, F f, V&& v, Ts&&... xs) +{ + std::initializer_list ranks = {static_cast(get_shape(xs).ndim())...}; + if(not std::all_of(ranks.begin(), ranks.end(), [&](index_int r) { return r == s.ndim(); })) + MIGRAPHX_THROW("Ranks must be the same"); + visit_tensor_size(s.ndim(), [&](auto ndim) { v(f(xs, ndim)...); }); +} + +template +struct hip_convert +{ + F f; + template + auto operator()(RawData x, N ndim, As as) const + -> decltype(make_hip_view(x.get_shape(), f(as.from(x.data())))) + { + return make_hip_view(x.get_shape(), f(as.from(x.data()))); + } + + template + auto operator()(const shape& s, N ndim, As) const + { + return make_hip_shape(s); + } +}; + +template +hip_convert make_hip_convert(F f) +{ + return {f}; +} + +template +struct hip_convert_view +{ + F f; + template + auto operator()(tensor_view x, N ndim) const + { + return make_hip_view(f(x)); + } + + template + auto operator()(const shape& s, N ndim) const + { + return make_hip_shape(s); + } +}; + +template +hip_convert_view make_hip_convert_view(F f) +{ + return {f}; +} + +template +auto hip_visit_all(T&& x, Ts&&... xs) +{ + return [&](auto f) { + hip_visit_all_impl( + get_shape(x), make_hip_convert([](auto* p) { return device_cast(p); }), f, x, xs...); + }; +} + +template +auto hip_vec_visit_all(T&& x, Ts&&... xs) +{ + return [&](auto f) { + auto sx = get_shape(x); + auto lens = sx.lens(); + assert(lens.back() % N == 0); + assert(sx.strides().back() == 1); + lens.back() /= N; + shape vec_sx{sx.type(), lens}; + hip_visit_all_impl(vec_sx, + make_hip_convert([](auto* p) { return as_vec(device_cast(p)); }), + f, + x, + xs...); + }; +} + +template +auto hip_pointer_visit_all(T&& x, Ts&&... xs) +{ + return [&](auto f) { visit_all(x, xs...)([&](auto... vs) { f(device_cast(vs.data())...); }); }; +} + +template +auto hip_visit_views(T&& x, Ts&&... xs) +{ + return [&](auto f) { + hip_visit_views_impl(get_shape(x), + make_hip_convert_view([](auto v) { return device_cast(v); }), + f, + x, + xs...); + }; +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/device/logsoftmax.cpp b/docker/rocm/migraphx/targets/gpu/device/logsoftmax.cpp new file mode 100644 index 000000000..f2dd6148b --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/logsoftmax.cpp @@ -0,0 +1,80 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
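The visit_tensor_size switch in visit.hpp above is the standard trick for turning a runtime tensor rank into a compile-time constant so that templates such as hip_shape can be instantiated per rank. A minimal host-side sketch of the same pattern, with visit_rank as an illustrative name:

#include <iostream>
#include <stdexcept>
#include <type_traits>

// Dispatch a runtime value to a compile-time integral_constant, mirroring visit_tensor_size.
template <class F>
void visit_rank(unsigned n, F f)
{
    switch(n)
    {
    case 1: f(std::integral_constant<unsigned, 1>{}); break;
    case 2: f(std::integral_constant<unsigned, 2>{}); break;
    case 3: f(std::integral_constant<unsigned, 3>{}); break;
    default: throw std::runtime_error("rank out of range");
    }
}

int main()
{
    visit_rank(2, [](auto rank) {
        // rank() is a constant expression here, usable as a template argument
        std::cout << "compile-time rank: " << rank() << '\n';
    });
}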
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis) +{ + auto batch_lens = result.get_shape().lens(); + index_int batch_item_num = batch_lens[axis]; + batch_lens[axis] = 1; + migraphx::shape batch_shape{result.get_shape().type(), batch_lens}; + + hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) { + const index_int max_block_size = 256; + const index_int block_size = compute_block_size(batch_item_num, max_block_size); + gs_launch(stream, + batch_shape.elements() * block_size, + block_size)([=](auto i, auto idx) __device__ { + auto data_idx = batch.multi(i / block_size); + using type = device_type>; + type init = lowest(); + + auto batch_max = block_reduce( + idx, max{}, init, batch_item_num, [&](auto j) __device__ { + data_idx[axis] = j; + return input[data_idx]; + }); + + auto batch_sum = + block_reduce(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ { + data_idx[axis] = j; + auto val = input[data_idx] - batch_max; + return ::exp(to_hip_type(val)); + }); + + auto log_batch_sum = ::log(to_hip_type(batch_sum)) + batch_max; + + idx.local_stride(batch_item_num, [&](auto j) __device__ { + data_idx[axis] = j; + output[data_idx] = input[data_idx] - log_batch_sum; + }); + }); + }); +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device/multinomial.cpp b/docker/rocm/migraphx/targets/gpu/device/multinomial.cpp new file mode 100644 index 000000000..e7a89d7f1 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/multinomial.cpp @@ -0,0 +1,90 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
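The logsoftmax kernel above computes, per batch slice, logsoftmax(x_i) = x_i - (max_x + log(sum_j exp(x_j - max_x))), using one block reduction for the maximum and one for the sum of exponentials. A plain host reference of that formula, useful for checking, might look like the following; log_softmax here is an illustrative helper, not the device code.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

// Numerically stable log-softmax: subtract the max before exponentiating.
std::vector<double> log_softmax(const std::vector<double>& x)
{
    double m = *std::max_element(x.begin(), x.end());
    double s = 0.0;
    for(double v : x)
        s += std::exp(v - m);
    double log_sum = std::log(s) + m;
    std::vector<double> out;
    out.reserve(x.size());
    for(double v : x)
        out.push_back(v - log_sum);
    return out;
}

int main()
{
    for(double v : log_softmax({1.0, 2.0, 3.0}))
        std::cout << v << ' '; // approx: -2.4076 -1.4076 -0.4076
}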
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +template +constexpr Iterator upper_bound(Iterator first, Iterator last, const T& value) +{ + Iterator it; + typename std::iterator_traits::difference_type count; + typename std::iterator_traits::difference_type step; + count = std::distance(first, last); + + while(count > 0) + { + it = first; + step = count / 2; + std::advance(it, step); + if(not(value < *it)) + { + first = ++it; + count -= step + 1; + } + else + count = step; + } + return first; +} + +void multinomial(hipStream_t stream, + const argument& result, + const argument& arg0, + const argument& arg1) +{ + size_t batch_size = arg0.get_shape().lens().front(); + size_t class_size = arg0.get_shape().lens().back(); + size_t sample_size = result.get_shape().lens().back(); + + visit_all(arg0, arg1)([&](auto cdf_host, auto dist_host) { + result.visit([&](auto output_host) { + hip_visit_views(cdf_host, dist_host, output_host)( + [&](auto cdf, auto dist, auto output) { + gs_launch(stream, batch_size * sample_size)([=](auto i) __device__ { + auto idx = output.get_shape().multi(i); + auto cdf_begin = cdf.begin() + (idx.front() * class_size); + auto cdf_end = cdf_begin + class_size; + auto* sample_iter = + upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end))); + output[i] = std::distance(cdf_begin, sample_iter); + }); + }); + }); + }); +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device/nonzero.cpp b/docker/rocm/migraphx/targets/gpu/device/nonzero.cpp new file mode 100644 index 000000000..223713390 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/nonzero.cpp @@ -0,0 +1,77 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
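The multinomial kernel above performs inverse-transform sampling: each uniform random value is scaled by the last (total) CDF entry of its batch and located with upper_bound. A host-side sketch of that single step, with sample_class as an illustrative name:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Given a running (possibly unnormalized) CDF for one batch and u in [0, 1),
// return the first class whose CDF entry exceeds u * total, as the kernel does.
std::size_t sample_class(const std::vector<double>& cdf, double u)
{
    double scaled = u * cdf.back();
    auto it = std::upper_bound(cdf.begin(), cdf.end(), scaled);
    return static_cast<std::size_t>(std::distance(cdf.begin(), it));
}

int main()
{
    std::vector<double> cdf{0.1, 0.4, 1.0}; // from class probabilities {0.1, 0.3, 0.6}
    std::cout << sample_class(cdf, 0.05) << ' '   // 0
              << sample_class(cdf, 0.35) << ' '   // 1
              << sample_class(cdf, 0.90) << '\n'; // 2
}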
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +argument nonzero(hipStream_t stream, const argument& result, const argument& arg_data) +{ + auto s = arg_data.get_shape(); + auto elem_num = s.elements(); + auto out_elem_num = result.get_shape().elements(); + + // call the prefix_sum function to do a prefix_sum to compute + // index in the output. Only 1 block can be used since we have + // only one prefix sum + const index_int block_size = 256; + hip_visit_all(arg_data, s)([&](auto input, auto si) { + const auto* in_ptr = device_cast(input.data()); + auto* ptr = result.cast(); + gs_launch(stream, block_size, block_size)([=](auto, auto idx) __device__ { + // fill all output to 0 first + idx.local_stride(out_elem_num, [&](auto j) { ptr[j] = 0; }); + + block_scan( + idx, + sum{}, + 0, + elem_num, + [&](auto j) { return (float_equal(in_ptr[j], 0)) ? 0 : 1; }, + [&](auto j, auto x) { + auto out_loc = x - 1; + if(float_equal(in_ptr[j], 0)) + return; + + auto index = si.multi(j); + for(size_t k = 0; k < index.size(); ++k) + { + ptr[k * elem_num + out_loc] = index[k]; + } + }); + }); + }); + + return result; +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device/prefix_scan_sum.cpp b/docker/rocm/migraphx/targets/gpu/device/prefix_scan_sum.cpp new file mode 100644 index 000000000..9518f5b45 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/prefix_scan_sum.cpp @@ -0,0 +1,143 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
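The nonzero kernel above is a stream compaction: the 0/1 mask of the input is prefix-summed with block_scan, and each nonzero element writes its coordinates into output slot x - 1, where x is its inclusive running count. A sequential, one-dimensional host sketch of the same bookkeeping (the kernel additionally uses float_equal and writes a full multi-index per element):

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<float> in{0.f, 3.f, 0.f, 5.f, 7.f};
    std::vector<int> positions; // output slot assigned to each nonzero input index
    int running = 0;
    for(std::size_t j = 0; j < in.size(); ++j)
    {
        running += (in[j] != 0.f) ? 1 : 0; // inclusive prefix sum of the mask
        if(in[j] != 0.f)
            positions.push_back(running - 1); // same "out_loc = x - 1" as the kernel
    }
    for(int p : positions)
        std::cout << p << ' '; // prints: 0 1 2
}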
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void prefix_scan_sum(hipStream_t stream, + const argument& result, + const argument& arg, + int32_t axis, + bool exclusive, + bool reverse) +{ + const index_int max_block_size = 256; + const index_int n = arg.get_shape().lens()[axis]; + auto rlens = result.get_shape().lens(); + rlens[axis] = 1; + + hip_visit_all(result, arg, result.get_shape().with_lens(rlens))( + [=](auto output, auto input, auto rshape) { + const index_int block_size = compute_block_size(rshape.elements(), max_block_size); + if(reverse and exclusive) + { + gs_launch(stream, rshape.elements() * block_size, block_size)( + [=](auto i, auto idx) __device__ { + const auto ridx = rshape.multi(i / block_size); + auto compute_idx = [&](auto j) { + auto k = ridx; + k[axis] = j; + return k; + }; + block_scan( + idx, + sum{}, + 0, + n, + reverse_scan(n, [&](auto j) { return input[compute_idx(j)]; }), + reverse_scan(n, [&](auto j, auto x) { + if(j == n - 1) + output[compute_idx(j)] = 0; + if(j > 0) + output[compute_idx(j - 1)] = x; + })); + }); + } + else if(reverse) + { + gs_launch(stream, rshape.elements() * block_size, block_size)( + [=](auto i, auto idx) __device__ { + const auto ridx = rshape.multi(i / block_size); + auto compute_idx = [&](auto j) { + auto k = ridx; + k[axis] = j; + return k; + }; + block_scan( + idx, + sum{}, + 0, + n, + reverse_scan(n, [&](auto j) { return input[compute_idx(j)]; }), + reverse_scan(n, [&](auto j, auto x) { output[compute_idx(j)] = x; })); + }); + } + else if(exclusive) + { + gs_launch(stream, rshape.elements() * block_size, block_size)( + [=](auto i, auto idx) __device__ { + const auto ridx = rshape.multi(i / block_size); + auto compute_idx = [&](auto j) { + auto k = ridx; + k[axis] = j; + return k; + }; + block_scan( + idx, + sum{}, + 0, + n, + [&](auto j) { return input[compute_idx(j)]; }, + [&](auto j, auto x) { + auto k = j + 1; + if(j == 0) + output[compute_idx(0)] = 0; + if(k < n) + output[compute_idx(k)] = x; + }); + }); + } + else + { + gs_launch(stream, rshape.elements() * block_size, block_size)( + [=](auto i, auto idx) __device__ { + const auto ridx = rshape.multi(i / block_size); + auto compute_idx = [&](auto j) { + auto k = ridx; + k[axis] = j; + return k; + }; + block_scan( + idx, + sum{}, + 0, + n, + [&](auto j) { return input[compute_idx(j)]; }, + [&](auto j, auto x) { output[compute_idx(j)] = x; }); + }); + } + }); +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device/reverse.cpp b/docker/rocm/migraphx/targets/gpu/device/reverse.cpp new file mode 100644 index 000000000..5d5831127 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/reverse.cpp @@ -0,0 +1,66 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
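The prefix_scan_sum implementation above covers the four inclusive/exclusive by forward/reverse combinations by wrapping block_scan's input and output callbacks with reverse_scan and an index shift. A small host example of what each variant produces for {1, 2, 3, 4}; std::partial_sum and std::exclusive_scan are used purely for illustration.

#include <iostream>
#include <numeric>
#include <vector>

// Expected results for {1, 2, 3, 4}:
//   inclusive:          1 3 6 10
//   exclusive:          0 1 3 6
//   reverse inclusive:  10 9 7 4
//   reverse exclusive:  9 7 4 0
int main()
{
    std::vector<int> in{1, 2, 3, 4}, out(4);

    std::partial_sum(in.begin(), in.end(), out.begin()); // inclusive scan
    for(int v : out)
        std::cout << v << ' ';
    std::cout << '\n';

    std::exclusive_scan(in.begin(), in.end(), out.begin(), 0); // exclusive scan (C++17)
    for(int v : out)
        std::cout << v << ' ';
    std::cout << '\n';
}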
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "migraphx/gpu/device/visit.hpp" +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +argument +reverse(hipStream_t stream, argument result, argument arg1, const std::vector& axes) +{ + auto s = arg1.get_shape(); + // auto lens = s.lens(); + std::vector axis_len(axes.begin(), axes.end()); + shape sa{shape::float_type, axis_len}; + std::size_t nelements = s.elements(); + visit_all(result, arg1)([&](auto output1, auto input1) { + hip_visit_views(output1, input1, s)([&](auto output, auto input, auto hs) { + hip_visit_views(sa)([&](auto daxes) { + auto lens = hs.lens; + gs_launch(stream, nelements)([=](auto i) __device__ { + auto idx = hs.multi(i); + auto in_idx = idx; + for(auto axis : daxes.lens) + in_idx[axis] = lens[axis] - 1 - idx[axis]; + output[idx] = input[in_idx]; + }); + }); + }); + }); + + return result; +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device/rnn_variable_seq_lens.cpp b/docker/rocm/migraphx/targets/gpu/device/rnn_variable_seq_lens.cpp new file mode 100644 index 000000000..6d21c702f --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/rnn_variable_seq_lens.cpp @@ -0,0 +1,140 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void rnn_var_sl_shift_sequence(hipStream_t stream, + const argument& result, + const argument& arg_hs, + const argument& arg_sl) +{ + auto output_shape = result.get_shape(); + int64_t max_len = output_shape.lens()[0]; + visit_all(result, arg_hs)([&](auto output, auto input) { + const auto* in_data = device_cast(input.data()); + auto* out_data = device_cast(output.data()); + auto out_s = make_hip_shape<3>(output_shape); + arg_sl.visit([&](auto sl) { + const auto* sl_data = device_cast(sl.data()); + gs_launch(stream, output_shape.elements(), 256)([=](auto i) __device__ { + auto idx = out_s.multi(i); + auto t = idx[0]; + auto b = idx[1]; + auto l = sl_data[b]; + auto val = in_data[0]; + val = 0; + if(t >= max_len - l) + { + auto in_idx = idx; + in_idx[0] -= (max_len - l); + val = in_data[out_s.index(in_idx)]; + } + out_data[i] = val; + }); + }); + }); +} + +void rnn_var_sl_shift_output(hipStream_t stream, + const argument& result, + const argument& arg_hs, + const argument& arg_sl, + bool is_reverse) +{ + auto output_shape = result.get_shape(); + int64_t max_len = output_shape.lens()[0]; + visit_all(result, arg_hs)([&](auto output, auto input) { + const auto* in_data = device_cast(input.data()); + auto* out_data = device_cast(output.data()); + auto out_s = make_hip_shape<4>(output_shape); + arg_sl.visit([&](auto sl) { + const auto* sl_data = device_cast(sl.data()); + gs_launch(stream, output_shape.elements(), 256)([=](auto i) __device__ { + auto idx = out_s.multi(i); + auto t = idx[0]; + auto d = idx[1]; + auto b = idx[2]; + auto l = sl_data[b]; + auto val = in_data[0]; + val = 0; + if(t < l) + { + int offset = (d == 1 or is_reverse) ? 
1 : 0; + auto in_idx = idx; + in_idx[0] += offset * (max_len - l); + val = in_data[out_s.index(in_idx)]; + } + out_data[i] = val; + }); + }); + }); +} + +void rnn_var_sl_last_output(hipStream_t stream, + const argument& result, + const argument& arg_hs, + const argument& arg_sl, + bool is_reverse) +{ + auto input_shape = arg_hs.get_shape(); + auto out_comp_lens = input_shape.lens(); + out_comp_lens[0] = 1; + shape out_comp_shape{input_shape.type(), out_comp_lens}; + + visit_all(result, arg_hs)([&](auto output, auto input) { + const auto* in_data = device_cast(input.data()); + auto* out_data = device_cast(output.data()); + arg_sl.visit([&](auto sl) { + const auto* sl_data = device_cast(sl.data()); + auto in_s = make_hip_shape<4>(input_shape); + auto out_s = make_hip_shape<4>(out_comp_shape); + gs_launch(stream, result.get_shape().elements(), 256)([=](auto i) __device__ { + auto idx = out_s.multi(i); + auto d = idx[1]; + auto b = idx[2]; + auto l = sl_data[b]; + if(is_reverse or d == 1) + { + idx[0] = 0; + } + else + { + idx[0] = l - 1; + } + out_data[i] = in_data[in_s.index(idx)]; + }); + }); + }); +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device/targets.cpp b/docker/rocm/migraphx/targets/gpu/device/targets.cpp new file mode 100644 index 000000000..0b1853db7 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/targets.cpp @@ -0,0 +1,66 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +static std::vector parse_targets() { return split_string(MIGRAPHX_GPU_TARGETS, ';'); } + +const std::vector& get_targets() +{ + static auto result = parse_targets(); + return result; +} + +std::string get_targets_as_string() { return join_strings(get_targets(), ", "); } + +static int get_device_id() +{ + int device; + auto status = hipGetDevice(&device); + if(status != hipSuccess) + MIGRAPHX_THROW("No device"); + return device; +} + +std::string get_device_name() +{ + hipDeviceProp_t props{}; + auto status = hipGetDeviceProperties(&props, get_device_id()); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to get device properties"); + return props.gcnArchName; +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device/targets.hpp.in b/docker/rocm/migraphx/targets/gpu/device/targets.hpp.in new file mode 100644 index 000000000..0a0e19aba --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/targets.hpp.in @@ -0,0 +1,52 @@ +/* +* The MIT License (MIT) +* +* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +* THE SOFTWARE. +*/ +#ifndef MIGRAPHX_GUARD_DEVICE_TARGETS_CPP +#define MIGRAPHX_GUARD_DEVICE_TARGETS_CPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { +#define MIGRAPHX_GPU_TARGETS "@GPU_TARGETS@" // NOLINT + +MIGRAPHX_DEVICE_EXPORT +const std::vector& get_targets(); + +MIGRAPHX_DEVICE_EXPORT +std::string get_targets_as_string(); + +MIGRAPHX_DEVICE_EXPORT +std::string get_device_name(); + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_DEVICE_TARGETS_CPP + + diff --git a/docker/rocm/migraphx/targets/gpu/device/topk.cpp b/docker/rocm/migraphx/targets/gpu/device/topk.cpp new file mode 100644 index 000000000..2168af94e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device/topk.cpp @@ -0,0 +1,239 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +template +struct hip_heap_vector +{ + MIGRAPHX_DEVICE_CONSTEXPR hip_heap_vector(T* val, index_int n, Index v_idx, Compare comp) + : data(val), size(n), data_index(v_idx), compare(comp) + { + make_heap(size); + } + + MIGRAPHX_DEVICE_CONSTEXPR void try_push(const T val) + { + if(compare(val, data[data_index(0)])) + return; + + pop_heap(size - 1); + data[data_index(size - 1)] = val; + push_heap(size - 1); + } + + MIGRAPHX_DEVICE_CONSTEXPR void sort() { sort_heap(size); } + + private: + MIGRAPHX_DEVICE_CONSTEXPR inline static void swap(T& v1, T& v2) noexcept + { + T v = v1; + v1 = v2; + v2 = v; + } + + MIGRAPHX_DEVICE_CONSTEXPR inline void heapify_down(index_int n, index_int index) + { + while(index < n) + { + auto pre_index = index; + index_int l = 2 * index + 1; + index_int r = 2 * index + 2; + + if(l < n and compare(data[data_index(l)], data[data_index(index)])) + { + index = l; + } + + if(r < n and compare(data[data_index(r)], data[data_index(index)])) + { + index = r; + if(compare(data[data_index(l)], data[data_index(r)])) + { + index = l; + } + } + + if(index == pre_index) + { + break; + } + + swap(data[data_index(index)], data[data_index(pre_index)]); + } + } + + MIGRAPHX_DEVICE_CONSTEXPR inline void heapify_up(index_int index) + { + while(index > 0) + { + auto parent_idx = (index - 1) / 2; + + if(not compare(data[data_index(index)], data[data_index(parent_idx)])) + { + break; + } + + swap(data[data_index(index)], data[data_index(parent_idx)]); + index = parent_idx; + } + } + + MIGRAPHX_DEVICE_CONSTEXPR inline void make_heap(index_int n) + { + for(int j = n / 2 - 1; j >= 0; --j) + { + heapify_down(n, j); + } + } + + MIGRAPHX_DEVICE_CONSTEXPR inline void push_heap(index_int loc) { heapify_up(loc); } + + MIGRAPHX_DEVICE_CONSTEXPR inline void pop_heap(index_int loc) + { + swap(data[data_index(0)], data[data_index(loc)]); + heapify_down(loc, 0); + } + + MIGRAPHX_DEVICE_CONSTEXPR inline void sort_heap(index_int n) + { + for(int j = n - 1; j > 0; --j) + { + swap(data[data_index(0)], data[data_index(j)]); + heapify_down(j, 0); + } + } + + T* data = nullptr; + index_int size; + Index data_index; + Compare compare; +}; + +template +__device__ hip_heap_vector +make_heap(T* data, index_int n, Index idx, Compare compare) +{ + return {data, n, 
idx, compare}; +} + +template +std::vector topk(hipStream_t stream, + const argument& val_res, + const argument& ind_res, + const argument& arg, + int64_t k, + int64_t axis, + Compare compare) +{ + auto in_s = arg.get_shape(); + auto in_lens = in_s.lens(); + auto out_s = val_res.get_shape(); + auto axis_dim = in_s.lens()[axis]; + auto comp_lens = in_lens; + comp_lens[axis] = 1; + shape comp_s{in_s.type(), comp_lens}; + std::size_t elem_num = comp_s.elements(); + + hip_visit_all(val_res, arg, out_s, in_s, comp_s)( + [&](auto out_val, auto input, auto oss, auto iss, auto css) { + auto* data = device_cast(input.data()); + auto* out = device_cast(out_val.data()); + auto* const ind = ind_res.cast(); + gs_launch(stream, elem_num)([=](auto i) __device__ { + auto idx = css.multi(i); + + auto in_idx = [&](int ii) { + auto iidx = idx; + iidx[axis] = ii; + return iss.index(iidx); + }; + + auto out_idx = [&](int ii) { + auto iidx = idx; + iidx[axis] = ii; + return oss.index(iidx); + }; + + auto data_compare = [=](auto ii, auto jj) { + return compare(data[in_idx(ii)], data[in_idx(jj)]); + }; + + for(int j = 0; j < k; ++j) + { + ind[out_idx(j)] = j; + } + + auto hp = make_heap(ind, k, out_idx, data_compare); + for(int j = k; j < axis_dim; ++j) + { + hp.try_push(j); + } + hp.sort(); + + for(int j = 0; j < k; ++j) + { + out[out_idx(j)] = data[in_idx(ind[out_idx(j)])]; + } + }); + }); + + return {val_res, ind_res}; +} + +argument topk_largest(hipStream_t stream, + const argument& val_res, + const argument& ind_res, + const argument& arg, + int64_t k, + int64_t axis) +{ + return {topk(stream, val_res, ind_res, arg, k, axis, std::less<>{})}; +} + +argument topk_smallest(hipStream_t stream, + const argument& val_res, + const argument& ind_res, + const argument& arg, + int64_t k, + int64_t axis) +{ + return {topk(stream, val_res, ind_res, arg, k, axis, std::greater<>{})}; +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/device_name.cpp b/docker/rocm/migraphx/targets/gpu/device_name.cpp new file mode 100644 index 000000000..c717742e2 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/device_name.cpp @@ -0,0 +1,68 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
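The topk kernel above keeps a k-element heap whose root is the weakest of the current candidates, replaces the root whenever a better element arrives, and finally heap-sorts the survivors. A host-side sketch of the same selection strategy using std::priority_queue in place of hip_heap_vector; topk_largest_sketch is an illustrative name.

#include <functional>
#include <iostream>
#include <queue>
#include <vector>

// Keep the k largest values seen so far; the min-heap root is the current worst candidate.
std::vector<int> topk_largest_sketch(const std::vector<int>& data, std::size_t k)
{
    std::priority_queue<int, std::vector<int>, std::greater<int>> heap;
    for(int v : data)
    {
        if(heap.size() < k)
            heap.push(v);
        else if(v > heap.top())
        {
            heap.pop(); // drop the weakest candidate, like try_push replacing the root
            heap.push(v);
        }
    }
    std::vector<int> result;
    while(not heap.empty())
    {
        result.push_back(heap.top()); // ascending order; reverse for descending
        heap.pop();
    }
    return result;
}

int main()
{
    for(int v : topk_largest_sketch({5, 1, 9, 3, 7, 2}, 3))
        std::cout << v << ' '; // prints: 5 7 9
}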
+ */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +int get_device_id() +{ + int device; + auto status = hipGetDevice(&device); + if(status != hipSuccess) + MIGRAPHX_THROW("No device"); + return device; +} + +std::string get_device_name() +{ + hipDeviceProp_t props{}; + auto status = hipGetDeviceProperties(&props, get_device_id()); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to get device properties"); + return props.gcnArchName; +} + +bool gfx_has_fp8fnuz_intrinsics() +{ + const auto device_name = trim(split_string(get_device_name(), ':').front()); + return (starts_with(device_name, "gfx94")); +} + +bool gfx_has_fp8ocp_intrinsics() +{ + const auto device_name = trim(split_string(get_device_name(), ':').front()); + bool is_navi_with_fp8ocp = starts_with(device_name, "gfx12") and device_name >= "gfx1200"; + bool is_mi_with_fp8ocp = starts_with(device_name, "gfx9") and device_name >= "gfx950"; + return (is_navi_with_fp8ocp or is_mi_with_fp8ocp); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/driver/CMakeLists.txt b/docker/rocm/migraphx/targets/gpu/driver/CMakeLists.txt new file mode 100644 index 000000000..ae9b9a685 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/driver/CMakeLists.txt @@ -0,0 +1,31 @@ +##################################################################################### +# The MIT License (MIT) +# +# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +##################################################################################### + +file(GLOB GPU_DRIVER_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +add_executable(gpu-driver + ${GPU_DRIVER_SRCS} +) +rocm_clang_tidy_check(gpu-driver) +target_include_directories(gpu-driver PRIVATE include) +target_link_libraries(gpu-driver PRIVATE migraphx_gpu) diff --git a/docker/rocm/migraphx/targets/gpu/driver/action.cpp b/docker/rocm/migraphx/targets/gpu/driver/action.cpp new file mode 100644 index 000000000..ea71afdf1 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/driver/action.cpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace driver { + +auto& action_map() +{ + static std::unordered_map m; + return m; +} + +action_function get_action(const std::string& name) +{ + if(action_map().count(name) == 0) + MIGRAPHX_THROW("Missing action: " + name); + return action_map().at(name); +} + +void register_action(const std::string& name, const action_function& a) { action_map()[name] = a; } + +} // namespace driver +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/driver/compile_op.cpp b/docker/rocm/migraphx/targets/gpu/driver/compile_op.cpp new file mode 100644 index 000000000..5caae2a79 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/driver/compile_op.cpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace driver { + +struct compile_op : action +{ + static void apply(const parser& p, const value& v) + { + context ctx; + auto inputs = p.parse_shapes(v.at("inputs")); + auto op = gpu::compile_op(v.at("name").to(), ctx, inputs, v); + auto t = time_op(ctx, op, inputs, p.get(v, "iterations", 100)); + std::cout << op << " -> " << op.compute_shape(inputs) << ": " << t << "ms" << std::endl; + std::cout << std::endl; + } +}; + +} // namespace driver +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/driver/include/migraphx/gpu/driver/action.hpp b/docker/rocm/migraphx/targets/gpu/driver/include/migraphx/gpu/driver/action.hpp new file mode 100644 index 000000000..172419e7c --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/driver/include/migraphx/gpu/driver/action.hpp @@ -0,0 +1,60 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_GPU_DRIVER_ACTION_HPP +#define MIGRAPHX_GUARD_GPU_DRIVER_ACTION_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace driver { + +using action_function = std::function; + +action_function get_action(const std::string& name); +void register_action(const std::string& name, const action_function& a); + +struct auto_register_action +{ + template + static void apply() + { + const auto& name = get_type_name(); + register_action(name.substr(name.rfind("::") + 2), + [](auto&&... xs) { T::apply(std::forward(xs)...); }); + } +}; + +template +using action = auto_register; + +} // namespace driver +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_DRIVER_ACTION_HPP diff --git a/docker/rocm/migraphx/targets/gpu/driver/include/migraphx/gpu/driver/parser.hpp b/docker/rocm/migraphx/targets/gpu/driver/include/migraphx/gpu/driver/parser.hpp new file mode 100644 index 000000000..d5995eeb5 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/driver/include/migraphx/gpu/driver/parser.hpp @@ -0,0 +1,68 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_GPU_DRIVER_PARSER_HPP +#define MIGRAPHX_GUARD_GPU_DRIVER_PARSER_HPP + +#include +#include + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace driver { + +[[noreturn]] void error(const std::string& msg); + +struct parser +{ + parser() = default; + + template + T get(const value& v, const std::string& key, const T& default_value) const + { + return v.get(key, settings.get(key, default_value)); + } + + shape parse_shape(const value& v) const; + + std::vector parse_shapes(const value& v) const; + + void load_settings(const value& v); + + static void process(const value& v); + + private: + value settings = value::object{}; +}; + +} // namespace driver +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_DRIVER_PARSER_HPP diff --git a/docker/rocm/migraphx/targets/gpu/driver/main.cpp b/docker/rocm/migraphx/targets/gpu/driver/main.cpp new file mode 100644 index 000000000..c61e447db --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/driver/main.cpp @@ -0,0 +1,44 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include + +using namespace migraphx; // NOLINT +using namespace migraphx::gpu; // NOLINT +using namespace migraphx::gpu::driver; // NOLINT + +int main(int argc, char const* argv[]) +{ + std::vector args(argv, argv + argc); + if(args.size() < 2) + { + std::cout << "Usage: gpu-driver " << std::endl; + std::abort(); + } + auto v = from_json_string(convert_to_json(read_string(args[1]))); + parser::process(v); +} diff --git a/docker/rocm/migraphx/targets/gpu/driver/parser.cpp b/docker/rocm/migraphx/targets/gpu/driver/parser.cpp new file mode 100644 index 000000000..c84d00580 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/driver/parser.cpp @@ -0,0 +1,81 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace driver { + +[[noreturn]] void error(const std::string& msg) +{ + std::cout << msg << std::endl; + std::abort(); +} + +shape parser::parse_shape(const value& v) const +{ + auto lens = get(v, "lens", std::vector{}); + auto strides = get(v, "strides", std::vector{}); + auto type = shape::parse_type(get(v, "type", "float")); + if(strides.empty()) + return shape{type, lens}; + else + return shape{type, lens, strides}; +} + +std::vector parser::parse_shapes(const value& v) const +{ + std::vector result; + std::transform( + v.begin(), v.end(), std::back_inserter(result), [&](auto&& x) { return parse_shape(x); }); + return result; +} + +void parser::load_settings(const value& v) +{ + if(v.contains("settings")) + settings = v.at("settings"); +} + +void parser::process(const value& v) +{ + if(not v.is_object()) + error("Input is not an object"); + parser p{}; + p.load_settings(v); + for(auto&& pp : v) + { + if(pp.get_key() == "settings") + continue; + get_action(pp.get_key())(p, pp.without_key()); + } +} + +} // namespace driver +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/driver/precompile_op.cpp b/docker/rocm/migraphx/targets/gpu/driver/precompile_op.cpp new file mode 100644 index 000000000..2aec2a2d3 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/driver/precompile_op.cpp @@ -0,0 +1,84 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace driver { + +struct precompile_op : action +{ + static program create_preop_program(const operation& preop, std::vector inputs) + { + program p; + auto* mm = p.get_main_module(); + std::vector args; + inputs.pop_back(); + transform(inputs, range(inputs.size()), std::back_inserter(args), [&](auto input, auto i) { + return mm->add_parameter("x" + std::to_string(i), input); + }); + mm->add_instruction(preop, args); + return p; + } + + static operation get_code_object(const program& p) + { + MIGRAPHX_TIDY_CONST auto* mm = p.get_main_module(); + auto it = std::find_if(mm->begin(), mm->end(), [](const auto& ins) { + return (ins.name() == "gpu::code_object"); + }); + if(it == mm->end()) + MIGRAPHX_THROW("Failed to create code object"); + return it->get_operator(); + } + static void apply(const parser& p, const value& v) + { + context ctx; + auto inputs = p.parse_shapes(v.at("inputs")); + auto name = v.at("name").to(); + auto preop = make_op(name); + if(v.contains("fields")) + preop.from_value(v.at("fields")); + bool exhaustive = v.get("exhaustive", false); + auto prog = create_preop_program(preop, inputs); + run_passes(prog, {lowering{}, compile_ops{&ctx, exhaustive}}); + auto op = get_code_object(prog); + auto t = time_op(ctx, op, inputs, p.get(v, "iterations", 100)); + std::cout << preop << ": " << t << "ms" << std::endl; + } +}; + +} // namespace driver +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/driver/run_op.cpp b/docker/rocm/migraphx/targets/gpu/driver/run_op.cpp new file mode 100644 index 000000000..d5575a933 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/driver/run_op.cpp @@ -0,0 +1,54 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace driver { + +struct run_op : action +{ + static void apply(const parser& p, const value& v) + { + context ctx; + auto inputs = p.parse_shapes(v.at("inputs")); + auto name = v.at("name").to(); + if(not contains(name, "::")) + name = "gpu::" + name; + auto op = make_op(name); + if(v.contains("fields")) + op.from_value(v.at("fields")); + auto t = time_op(ctx, op, inputs, p.get(v, "iterations", 100)); + std::cout << op << " -> " << op.compute_shape(inputs) << ": " << t << "ms" << std::endl; + } +}; + +} // namespace driver +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/fuse_ck.cpp b/docker/rocm/migraphx/targets/gpu/fuse_ck.cpp new file mode 100644 index 000000000..bf9a269f3 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/fuse_ck.cpp @@ -0,0 +1,217 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +struct module; + +namespace gpu { + +struct ck_gemm +{ + operation op = make_op("dot"); + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.op, "op")); + } + + std::string name() const { return "gpu::ck_gemm"; } + + void check_gemm_shape(const shape& s) const + { + if(not contains(range(s.strides().rbegin(), s.strides().rbegin() + 3), 1)) + MIGRAPHX_THROW("Invalid shape for ck_gemm"); + } + + shape compute_shape(std::vector inputs, const std::vector& mods) const + { + check_shapes{inputs, *this}.same_ndims(); + if(inputs.size() < 2) + MIGRAPHX_THROW(name() + ": should have at least two inputs."); + auto a = inputs[0]; + auto b = inputs[1]; + for(const auto& input : inputs) + check_gemm_shape(input); + auto r = op.compute_shape({a, b}); + if(mods.empty()) + return r; + return r.with_type(mods.front()->get_output_shapes().front().type()); + } + + static bool is_ck_supported_type(shape::type_t t) + { + return contains({shape::half_type, shape::int8_type, shape::int32_type}, t); + } +}; +MIGRAPHX_REGISTER_OP(ck_gemm); + +struct ck_gemm_softmax_gemm : gemm_softmax_gemm +{ + std::string name() const { return "gpu::ck_gemm_softmax_gemm"; } +}; +MIGRAPHX_REGISTER_OP(ck_gemm_softmax_gemm); + +namespace { + +MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins) +{ + if(ins->name() != "dot" and ins->name() != "quant_dot") + return false; + if(not ck_gemm::is_ck_supported_type(ins->get_shape().type())) + return false; + auto a = ins->inputs().front()->get_shape(); + auto b = ins->inputs().back()->get_shape(); + auto m = a.lens()[a.lens().size() - 2]; + auto n = b.lens().back(); + auto k = a.lens().back(); + auto batch_size = std::accumulate( + a.lens().rbegin() + 2, a.lens().rend(), std::size_t{1}, std::multiplies()); + // Integer gemms must be divisible by 4 in ck + if(contains({shape::int8_type, shape::int32_type}, ins->get_shape().type())) + { + if(m % 4 != 0) + return false; + if(n % 4 != 0) + return false; + if(k % 4 != 0) + return false; + } + auto device_name = trim(split_string(get_device_name(), ':').front()); + if(starts_with(device_name, "gfx94")) + { + if(ins->get_shape().type() == shape::half_type) + { + if(batch_size >= 64) + return m < 2048 or k <= 64 or n <= 384 or n >= 2048; + return true; + } + return true; + } + return k <= 2048; +} + +struct find_ck_gemm_pointwise +{ + // Find a gemm followed by a pointwise operation. 
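+    // The matcher skips an optional "contiguous" op between the (quant_)dot and its pointwise
+    // consumer. apply() bails out on unsupported element types and "capture" inputs, reorders
+    // the pointwise submodule parameters so the gemm output becomes the first parameter, and
+    // finally replaces the pointwise instruction with a ck_gemm that also takes the gemm inputs.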
+ auto matcher() const + { + auto gemm = match::skip(match::name("contiguous"))( + match::name("dot", "quant_dot")(is_ck_gemm().bind("gemm"))); + return match::name("pointwise")(match::any_of[match::inputs()](gemm.bind("x"))); + } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto ins = r.result; + auto gemm_ins = r.instructions["gemm"]; + auto x_ins = r.instructions["x"]; // input after contiguous + auto* pm = ins->module_inputs().front(); + auto names = pm->get_parameter_names(); + std::sort(names.begin(), names.end()); + auto inputs = ins->inputs(); + auto gemm_it = std::find(inputs.begin(), inputs.end(), x_ins); + auto gemm_idx = gemm_it - inputs.begin(); + if(gemm_ins->get_shape().type() != shape::int32_type and + ins->get_shape().type() != gemm_ins->get_shape().type()) + return; + if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto input) { + return not ck_gemm::is_ck_supported_type(input->get_shape().type()); + })) + return; + if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto input) { + return not input->inputs().empty() and input->inputs().front()->name() == "capture"; + })) + return; + if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto input) { + return not input->inputs().empty() and input->inputs().front()->name() == "capture"; + })) + return; + assert(gemm_it != inputs.end()); + if(gemm_idx != 0) + { + auto first_param = pm->get_parameter(names[0]); + auto gemm_param = pm->get_parameter(names[gemm_idx]); + auto new_gemm_param = pm->add_parameter(names[0] + "_0", gemm_param->get_shape()); + auto new_first_param = + pm->add_parameter(names[gemm_idx] + "_0", first_param->get_shape()); + pm->replace_instruction(gemm_param, new_gemm_param); + pm->replace_instruction(first_param, new_first_param); + pm->remove_instruction(first_param); + pm->remove_instruction(gemm_param); + } + inputs.erase(gemm_it); + inputs.insert(inputs.begin(), gemm_ins->inputs().begin(), gemm_ins->inputs().end()); + + mpm.get_module().replace_instruction(ins, ck_gemm{gemm_ins->get_operator()}, inputs, {pm}); + } +}; + +struct find_ck_gemm +{ + auto matcher() const { return match::name("dot", "quant_dot")(is_ck_gemm().bind("gemm")); } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto ins = r.result; + mpm.get_module().replace_instruction(ins, ck_gemm{ins->get_operator()}, ins->inputs()); + } +}; + +struct find_ck_gemm_softmax_gemm +{ + auto matcher() const { return match::name("gpu::pre_gemm_softmax_gemm"); } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto ins = r.result; + auto v = ins->get_operator().to_value(); + assert(v.contains("scale")); + auto scale = v.at("scale").to(); + mpm.get_module().replace_instruction( + ins, ck_gemm_softmax_gemm{migraphx::make_op("dot"), scale}, ins->inputs()); + } +}; + +} // namespace + +void fuse_ck::apply(module_pass_manager& mpm) const +{ + match::find_matches(mpm, find_ck_gemm_softmax_gemm{}, find_ck_gemm_pointwise{}); + match::find_matches(mpm, find_ck_gemm{}); +} + +} // namespace gpu + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/fuse_mlir.cpp b/docker/rocm/migraphx/targets/gpu/fuse_mlir.cpp new file mode 100644 index 000000000..519955c21 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/fuse_mlir.cpp @@ -0,0 +1,1106 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module; + +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_EXTRA_MLIR); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_MLIR_INPUT_FUSION); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_MLIR_REDUCE_FUSION); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_MLIR); +/** + * @brief Declares a new MIGraphX environment variable which forces to generate + * only specific MLIR operations. + * + * The variable, if defined, forces MIGraphX to use only specific operations + * with MLIR regardless of the underlying GPU architecture. The variable accepts + * a list of operations separated by comma. The variable recognizes the following + * operations: "fused", "convolution", "dot". If the variable is not defined MIGraphX + * will decide by itself which operations to delegate to MLIR. The variable is + * intended to be primarily used by rocMLIR developers. 
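+ *
+ * Illustrative example (syntax as described above, not an exhaustive list):
+ *   MIGRAPHX_MLIR_USE_SPECIFIC_OPS=dot,convolution restricts MLIR to dot and convolution
+ *   kernels, while a leading '!' or '~' negates an entry (see is_negated_op below), e.g.
+ *   MIGRAPHX_MLIR_USE_SPECIFIC_OPS=!fused.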
+ */ +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_USE_SPECIFIC_OPS); + +bool mlir_enabled() +{ +#ifdef MIGRAPHX_MLIR + const bool mlir_disabled = enabled(MIGRAPHX_DISABLE_MLIR{}); + return not mlir_disabled; +#else + return false; +#endif +} + +namespace { +struct requested +{ +}; +struct rejected +{ +}; +} // namespace + +static bool is_negated_op(const std::string& s) +{ + if(s.empty()) + return false; + return contains({'!', '~'}, s[0]); +} + +template +static std::vector get_usage() +{ + static const auto options = + split_string(string_value_of(MIGRAPHX_MLIR_USE_SPECIFIC_OPS{}, ""), ','); + static const bool enabled = std::is_same{}; + std::vector result; + auto remove_not_symbol = [&](const std::string& s) { + if(is_negated_op(s)) + return s.substr(1); + return s; + }; + transform_if( + options.begin(), + options.end(), + std::back_inserter(result), + [&](const std::string& option) { + if(option.empty()) + return false; + if(is_negated_op(option)) + return not enabled; + return enabled; + }, + remove_not_symbol); + return result; +} + +template +static bool specific_op(std::string_view option, bool fallback = false) +{ + static const auto options = get_usage(); + if(options.empty()) + return fallback; + if(contains(option, "fused") and contains(options, "fused")) + return true; + return contains(options, option); +} + +bool mlir_attention_enabled(context* ctx) +{ +#ifdef MIGRAPHX_MLIR + if(not mlir_enabled()) + return false; + if(specific_op("attention")) + return false; + // Enable attention by default for mi300 + if(ctx != nullptr and starts_with(ctx->get_current_device().get_gfx_name(), "gfx94")) + return true; + return specific_op("attention"); +#else + return false; +#endif +} + +#ifdef MIGRAPHX_MLIR + +struct mlir_op +{ + std::string name() const { return "gpu::mlir_op"; } + operation op = make_op("convolution"); + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.op, "op")); + } + + // Check if the shape can be created from a transpose/broadcast/slice + static bool is_mlir_compatible(const shape& s) + { + if(s.standard() or s.packed() or s.scalar() or s.ndim() == 1) + return true; + auto ns = reorder_shape(s, find_permutation(s)); + std::vector stride_ratios; + auto last = std::find(ns.strides().begin(), ns.strides().end(), 0); + if(*std::prev(last) != 1) + return false; + std::adjacent_difference(ns.strides().begin(), + last, + std::back_inserter(stride_ratios), + [](auto y, auto x) -> std::size_t { + assert(y != 0); + if((x % y) != 0) + return 0; + return x / y; + }); + return std::equal(stride_ratios.begin() + 1, + stride_ratios.end(), + ns.lens().begin() + 1, + [](auto ratio, auto len) { return ratio >= len; }); + } + + shape compute_shape(const std::vector& inputs, const std::vector& mods) const + { + module_ref mod = mods[0]; + check_shapes{inputs, *this}.has_at_least(1); + if(mods.size() != 1) + MIGRAPHX_THROW("should have one submodule."); + + if(not std::all_of(inputs.begin(), inputs.end(), &is_mlir_compatible)) + MIGRAPHX_THROW("Shape is not mlir compatible."); + + auto result = + mod->compute_shapes(inputs, {.name = name(), .strict_type = true, .strict_lens = true}); + if(result.size() == 1) + return result.front(); + return shape{result}; + } +}; +MIGRAPHX_REGISTER_OP(mlir_op); + +namespace { + +const auto& reshaper_names() +{ + // clang-format off + static const std::unordered_set names = { + "slice", + "transpose", + "multibroadcast", + "broadcast", + "contiguous", + "reshape", + "lazy_reshape", + "squeeze", + "flatten", + "unsqueeze" + }; + 
// clang-format on + return names; +} + +std::tuple> +get_fusable_input_op_stream(instruction_ref lower_input) +{ + instruction_ref upper_input = lower_input; + std::vector op_stream; + while(contains(reshaper_names(), upper_input->name())) + { + operation op = upper_input->get_operator(); + op_stream.push_back(op); + upper_input = upper_input->inputs().at(0); + } + return {upper_input, op_stream}; +} + +void fuse_input_ops(module_ref mm, + const std::vector& inputs, + std::unordered_map* map_ins) +{ + assert(map_ins != nullptr); + size_t input_cnt = mm->get_parameters().size(); + for(instruction_ref input : inputs) + { + if(contains(*map_ins, input)) + continue; + auto [upper_input, op_stream] = get_fusable_input_op_stream(input); + if(not contains(*map_ins, upper_input)) + (*map_ins)[upper_input] = + mm->add_parameter(param_name(input_cnt++), upper_input->get_shape().as_standard()); + instruction_ref prev_input = (*map_ins)[upper_input]; + for(const auto& op : reverse(op_stream)) + { + prev_input = mm->add_instruction(op, {prev_input}); + } + (*map_ins)[input] = prev_input; + } +} + +std::tuple> +fuse_input_ops_and_gemm_based_op(module_ref mm, + const std::vector& gemm_based_op_inputs, + const operation& gemm_based_op) +{ + std::vector top_inputs; + std::vector imm_inputs; + size_t input_cnt = 0; + for(instruction_ref input : gemm_based_op_inputs) + { + auto [upper_input, op_stream] = get_fusable_input_op_stream(input); + top_inputs.push_back(upper_input); + instruction_ref prev_input = + mm->add_parameter(param_name(input_cnt++, "y"), upper_input->get_shape().as_standard()); + for(const auto& op : reverse(op_stream)) + { + prev_input = mm->add_instruction(op, {prev_input}); + } + imm_inputs.push_back(prev_input); + } + instruction_ref new_gemm_based_op = mm->add_instruction(gemm_based_op, imm_inputs); + return {new_gemm_based_op, top_inputs}; +} + +enum class mlir_mode +{ + all, + fast, + int8, + none +}; + +auto is_mlir_dot(mlir_mode mode) +{ + return match::make_basic_pred_matcher([=](instruction_ref ins) { + if(mode == mlir_mode::none) + return false; + if(ins->name() != "dot" and ins->name() != "quant_dot") + return false; + // dot operation where (FP8 * FP8 = FP8) is not available in MLIR. rocBLAS/hipBLASLt should + // have the support for it. + if(contains(fp8_types{}.get(), ins->get_shape().type())) + return false; + if(mode != mlir_mode::fast) + return true; + auto a = ins->inputs().front()->get_shape(); + auto b = ins->inputs().back()->get_shape(); + // auto m = a.lens()[a.lens().size() - 2]; + // auto n = b.lens().back(); + auto k = a.lens().back(); + // Skipping GEMMs with a K dimension greater than 2048 is a course-grained strategy + // to avoid poor-performing GEMM kernels from MLIR + // To-do: Investigate a more precise strategy + return k <= 1024; + }); +} + +auto is_mlir_conv(mlir_mode mode) +{ + return match::make_basic_pred_matcher([=](instruction_ref ins) { + if(mode == mlir_mode::none) + return false; + if(ins->name() != "convolution" and ins->name() != "quant_convolution") + return false; + auto input = ins->inputs().front()->get_shape(); + value v = ins->get_operator().to_value(); + auto group = v.at("group").to(); + // Avoid MLIR assertion: Index < Length && "Invalid index!" 
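+        // (grouped convolutions are therefore only offloaded to MLIR when the input is 4-D)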
+ if(ins->get_shape().lens().size() != 4 and group > 1) + return false; + std::set supported_types = fp8_types{}.get(); + supported_types.insert(shape::int8_type); + if(contains(supported_types, input.type())) + return true; + if(mode == mlir_mode::all) + return true; + // No winograd for group convolution + if(group > 1) + return true; + auto w = ins->inputs().at(1)->get_shape(); + if(w.lens().size() != 4) + return true; + if(w.lens()[2] != w.lens()[3]) + return true; + return (w.lens()[3] % 3) != 0; + }); +} + +std::unordered_map +create_param_map_with_literals(module_ref mm, const module* pm, const shape& shape) +{ + std::unordered_map ins_map; + for(auto ins : iterator_for(*pm)) + { + if(ins->name() != "@literal") + { + continue; + } + literal r = ins->get_literal(); + instruction_ref literal = mm->add_literal(r); + instruction_ref mbcast = + mm->add_instruction(make_op("multibroadcast", {{"out_lens", shape.lens()}}), literal); + ins_map[ins] = mbcast; + } + return ins_map; +} + +instruction_ref unroll_pointwise(module& main_mod, + instruction_ref pos, + const operation& op, + const std::vector& inputs, + const std::vector& mod_args) +{ + if(op.name() == "pointwise") + { + auto* sub_pm = mod_args.front(); + auto param_map_2 = create_param_map_with_literals( + &main_mod, sub_pm, op.compute_shape(to_shapes(inputs), mod_args)); + return main_mod.insert_inline(pos, *sub_pm, inputs, ¶m_map_2) + .front(); // cppcheck-suppress returnDanglingLifetime; + } + return main_mod.insert_instruction(pos, op, inputs, mod_args); +} + +// Whitelist supported fusion options, including imposing type constraints +// for cases where MLIR only supports an operation (usually a pointwise function) +// on particular types. +bool is_pointwise_op_supported_by_mlir(const instruction& i) +{ + using type_t = shape::type_t; + const auto& name = i.name(); + const auto result_type = i.get_shape().type(); + const std::initializer_list allowed_types = {type_t::float_type, + type_t::bf16_type, + type_t::half_type, + type_t::fp8e4m3fnuz_type, + type_t::fp8e5m2fnuz_type, + type_t::fp8e4m3fn_type, + type_t::fp8e5m2_type, + type_t::int8_type, + type_t::uint8_type, + type_t::int32_type, + type_t::uint32_type, + type_t::bool_type}; + // Preliminary type check. + if(not contains(allowed_types, result_type)) + { + return false; + } + const std::initializer_list any_type_ops = {"@literal", "@param", "@return"}; + const std::initializer_list no_bool_ops = { + "convolution", + "quant_convolution", + "dot", + "quant_dot", + "add", + "clip", + "relu", + "sub", + "mul", + "div", + "pow", + "where", + "quantizelinear", + "dequantizelinear", + "abs", + "neg", + }; + const std::initializer_list fp_only_ops = { + "ceil", + "erf", + "exp", + "floor", + "log", + "recip", + "sqrt", + "rsqrt", + "sigmoid", + "softmax", + "tanh", + }; + std::set float_types = {type_t::float_type, + type_t::half_type, + type_t::bf16_type, + type_t::fp8e4m3fnuz_type, + type_t::fp8e5m2fnuz_type, + type_t::fp8e4m3fn_type, + type_t::fp8e5m2_type}; + bool is_float = contains(float_types, result_type); + if(contains(any_type_ops, name)) + return true; + if(result_type != type_t::bool_type and contains(no_bool_ops, name)) + return true; + if(is_float and contains(fp_only_ops, name)) + return true; + // Only conversions between floating types are known to be unambigiously + // supported. 
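+    // i.e. "convert" is accepted only when the result is a non-fp8 float type and every input
+    // is float, half, or bf16.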
+ if(is_float and name == "convert") + { + if(contains(fp8_types{}.get(), result_type)) + { + return false; + } // else + return std::all_of(i.inputs().begin(), i.inputs().end(), [](const auto& arg) { + return contains({type_t::float_type, type_t::half_type, type_t::bf16_type}, + arg->get_shape().type()); + }); + } + return false; +} + +bool is_reduce_op_supported_by_mlir(const instruction& i) +{ + using type_t = shape::type_t; + const auto& name = i.name(); + const auto result_type = i.get_shape().type(); + const std::initializer_list allowed_types = {type_t::float_type, + type_t::half_type, + type_t::bf16_type, + type_t::fp8e4m3fnuz_type, + type_t::fp8e5m2fnuz_type, + type_t::fp8e4m3fn_type, + type_t::fp8e5m2_type}; + + // Preliminary type check. + if(not contains(allowed_types, result_type)) + { + return false; + } + const std::initializer_list reduce_ops = {"reduce_mean", "reduce_sum"}; + return contains(reduce_ops, i.name()); +} + +// A separate function so we can remove operators that are supported by mlir +// but not supported for an input fusion. +bool is_pointwise_op_supported_by_mlir_for_input(const instruction& i) +{ + return is_pointwise_op_supported_by_mlir(i); +} + +MIGRAPHX_PRED_MATCHER(mlir_split_reduce, instruction_ref ins) +{ + if(ins->name() != "split_fused_reduce") + return false; + auto* mod_arg = ins->module_inputs().front(); + auto supported_reshapes = reshaper_names(); + supported_reshapes.erase("slice"); + std::unordered_set builtins = {"@param", "@literal", "@return"}; + for(const auto i : iterator_for(*mod_arg)) + { + if(is_reduce(*i)) + { + if(not is_reduce_op_supported_by_mlir(*i)) + return false; + } + else if(i->name() == "pointwise") + { + if(not std::all_of(i->module_inputs().front()->begin(), + i->module_inputs().front()->end(), + &is_pointwise_op_supported_by_mlir)) + return false; + } + else if(not contains(reshaper_names(), i->name()) and not contains(builtins, i->name())) + { + return false; + } + } + return true; +} + +MIGRAPHX_PRED_MATCHER(mlir_pointwise, instruction_ref ins) +{ + if(ins->name() != "pointwise") + return false; + auto* pm = ins->module_inputs().front(); + return std::all_of(pm->begin(), pm->end(), &is_pointwise_op_supported_by_mlir); +} + +MIGRAPHX_PRED_MATCHER(mlir_input_pointwise, instruction_ref ins) +{ + if(ins->name() != "pointwise") + return false; + auto* pm = ins->module_inputs().front(); + return std::all_of(pm->begin(), pm->end(), &is_pointwise_op_supported_by_mlir_for_input); +} + +std::vector mlir_contiguous(module_pass_manager& mpm, + const std::vector& inputs) +{ + std::vector result; + std::transform( + inputs.begin(), inputs.end(), std::back_inserter(result), [&](instruction_ref input) { + if(input->get_shape().packed() or input->get_shape().broadcasted()) + return input; + return mpm.get_module().insert_instruction( + std::next(input), make_op("contiguous"), input); + }); + return result; +} + +struct find_mlir_split_reduce +{ + mlir_mode conv_mode = mlir_mode::none; + mlir_mode dot_mode = mlir_mode::none; + auto matcher() const + { + auto dot_or_conv = match::name("gpu::mlir_op"); + // TODO: Handle reshapes inbetween + return mlir_split_reduce()(match::any_of[match::inputs()](dot_or_conv.bind("gemm"))); + } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto reduce_ins = r.result; + auto gemm_ins = r.instructions["gemm"]; + assert(gemm_ins->get_shape().sub_shapes().empty()); + auto* rm = reduce_ins->module_inputs().front(); + auto names = rm->get_parameter_names(); + 
std::sort(names.begin(), names.end()); + module_ref gemm_old_mm = gemm_ins->module_inputs().front(); + module_ref mm = mpm.create_module(gemm_old_mm->name() + "_" + rm->name(), *gemm_old_mm); + // remove last return instruction + if(std::prev(mm->end())->name() == "@return") + { + mm->remove_instruction(std::prev(mm->end())); + } + mm->set_bypass(); + std::unordered_map param_map; + param_map[gemm_ins] = std::prev(mm->end()); + bool gemm_has_multi_outs = gemm_ins->outputs().size() > 1; + auto return_vals = mm->fuse(*rm, reduce_ins->inputs(), ¶m_map, &unroll_pointwise); + if(gemm_has_multi_outs) + { + return_vals.insert(return_vals.end(), param_map[gemm_ins]); + } + mm->add_return(return_vals); + std::vector inputs; + std::copy_if(reduce_ins->inputs().begin(), + reduce_ins->inputs().end(), + std::back_inserter(inputs), + [&](auto input) { return input != gemm_ins; }); + inputs.insert(inputs.end(), gemm_ins->inputs().begin(), gemm_ins->inputs().end()); + if(gemm_has_multi_outs) + { + auto fused_ins = mpm.get_module().insert_instruction( + reduce_ins, mlir_op{gemm_ins->get_operator()}, mlir_contiguous(mpm, inputs), {mm}); + auto dot_ins = mpm.get_module().insert_instruction( + reduce_ins, + migraphx::make_op("get_tuple_elem", {{"index", return_vals.size() - 1}}), + fused_ins); + + mpm.get_module().replace_instruction(gemm_ins, dot_ins); + for(const auto& outs : reduce_ins->outputs()) + { + assert(outs->get_operator().name() == "get_tuple_elem"); + mpm.get_module().replace_instruction(outs, outs->get_operator(), fused_ins); + } + } + else + { + mpm.get_module().replace_instruction( + reduce_ins, mlir_op{gemm_ins->get_operator()}, mlir_contiguous(mpm, inputs), {mm}); + } + } +}; + +struct find_mlir_fused_ops +{ + mlir_mode conv_mode = mlir_mode::none; + mlir_mode dot_mode = mlir_mode::none; + auto matcher() const + { + auto reshapes = reshaper_names(); + // slice is not supported + reshapes.erase("slice"); + auto dot_or_conv = match::skip(match::name(reshapes))( + match::any_of(is_mlir_dot(dot_mode), is_mlir_conv(conv_mode)).bind("gemm_based_op")); + return mlir_pointwise()(match::any_of[match::inputs()](dot_or_conv.bind("x"))); + } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto pw_ins = r.result; + auto gemm_based_op = r.instructions["gemm_based_op"]; + auto x_ins = r.instructions["x"]; // input to pointwise after reshaper op stream + auto* pm = pw_ins->module_inputs().front(); + auto pw_inputs = pw_ins->inputs(); + // only of one of the inputs to pointwise module should be dependent on conv/gemm that is + // being fused, otherwise it can create invalid graph transformation + if(std::any_of(pw_inputs.begin(), pw_inputs.end(), [&](const auto& i) { + return i != x_ins and reaches(gemm_based_op, i); + })) + return; + auto names = pm->get_parameter_names(); + std::sort(names.begin(), names.end()); + module_ref mm = mpm.create_module("mlir_" + pm->name()); + mm->set_bypass(); + auto [anchor_op, top_inputs] = fuse_input_ops_and_gemm_based_op( + mm, gemm_based_op->inputs(), gemm_based_op->get_operator()); + std::unordered_map param_map = + create_param_map_with_literals(mm, pm, pw_ins->get_shape()); + auto [upper_input, op_stream] = get_fusable_input_op_stream(x_ins); + assert(upper_input == gemm_based_op); + auto prev_input = anchor_op; + for(const auto& op : reverse(op_stream)) + { + prev_input = mm->add_instruction(op, {prev_input}); + } + assert(prev_input->get_shape().lens() == x_ins->get_shape().lens()); + param_map[x_ins] = prev_input; // this is to 
avoid adding parameter for gemm/conv reshaped + // input to pointwise in new fused module + bool gemm_has_multi_outs = gemm_based_op->outputs().size() > 1; + auto reshaped_gemm = x_ins; + std::vector reshapes_vec; + while(reshaped_gemm != gemm_based_op) + { + reshapes_vec.push_back(reshaped_gemm); + gemm_has_multi_outs = gemm_has_multi_outs or reshaped_gemm->outputs().size() > 1; + reshaped_gemm = reshaped_gemm->inputs().at(0); + } + reshapes_vec.push_back(reshaped_gemm); + + auto return_vals = mm->fuse(*pm, pw_ins->inputs(), ¶m_map); + if(gemm_has_multi_outs) + { + return_vals.insert(return_vals.begin(), anchor_op); + } + mm->add_return(return_vals); + + std::vector inputs; + std::copy_if(pw_ins->inputs().begin(), + pw_ins->inputs().end(), + std::back_inserter(inputs), + [&](auto input) { return input != x_ins; }); + inputs.insert(inputs.end(), top_inputs.begin(), top_inputs.end()); + if(gemm_has_multi_outs) + { + auto fused_ins = mpm.get_module().insert_instruction( + pw_ins, mlir_op{gemm_based_op->get_operator()}, mlir_contiguous(mpm, inputs), {mm}); + mpm.get_module().replace_instruction( + pw_ins, migraphx::make_op("get_tuple_elem", {{"index", 1}}), fused_ins); + auto dot_ins = mpm.get_module().insert_instruction( + pw_ins, migraphx::make_op("get_tuple_elem", {{"index", 0}}), fused_ins); + // move all the reshape instructions and original GEMM instruction after the fused op to + // avoid generating invalid migraphx program + for(const auto& orig_i : reverse(reshapes_vec)) + { + mpm.get_module().move_instruction(orig_i, pw_ins); + } + mpm.get_module().replace_instruction(gemm_based_op, dot_ins); + } + else + { + mpm.get_module().replace_instruction( + pw_ins, mlir_op{gemm_based_op->get_operator()}, mlir_contiguous(mpm, inputs), {mm}); + } + } +}; + +template +struct find_mlir_standalone_op +{ + mlir_mode mode = mlir_mode::none; + std::size_t* counter = nullptr; + auto matcher() const { return Matcher(mode); } + + std::string get_count() const + { + if(counter == nullptr) + MIGRAPHX_THROW("Invalid counter"); + return std::to_string((*counter)++); + } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto gemm_based_op = r.result; + // enable only for fp32/fp16/i8/fp8 types + if(std::any_of(gemm_based_op->inputs().begin(), gemm_based_op->inputs().end(), [&](auto i) { + return not contains({shape::type_t::float_type, + shape::type_t::half_type, + shape::type_t::bf16_type, + shape::type_t::int8_type, + shape::type_t::fp8e4m3fnuz_type, + shape::type_t::fp8e5m2fnuz_type, + shape::type_t::fp8e4m3fn_type, + shape::type_t::fp8e5m2_type}, + i->get_shape().type()); + })) + return; + std::string module_name = "mlir_" + gemm_based_op->name() + get_count(); + if(mpm.get_module().name() != "main") + module_name = mpm.get_module().name() + ":" + module_name; + module_ref mm = mpm.create_module(module_name); + mm->set_bypass(); + auto [anchor_op, top_inputs] = fuse_input_ops_and_gemm_based_op( + mm, gemm_based_op->inputs(), gemm_based_op->get_operator()); + mm->add_return({anchor_op}); + mpm.get_module().replace_instruction(gemm_based_op, + mlir_op{gemm_based_op->get_operator()}, + mlir_contiguous(mpm, top_inputs), + {mm}); + } +}; + +using find_mlir_standalone_convolution_op = find_mlir_standalone_op<&is_mlir_conv>; +using find_mlir_standalone_dot_op = find_mlir_standalone_op<&is_mlir_dot>; + +struct find_mlir_standalone_attention_op +{ + mlir_mode dot_mode = mlir_mode::none; + + auto matcher() const + { + auto gemm1 = + 
match::skip(match::name("contiguous"))(match::used_once(), is_mlir_dot(dot_mode)) + .bind("gemm1"); + auto fused_reduce = + match::name("fused_reduce")(match::used_once(), + match::any_of[match::inputs()]( + match::skip(match::name("reshape").bind("rsp"))(gemm1))) + .bind("fused_reduce"); + return is_mlir_dot(dot_mode)(match::arg(0)(fused_reduce)).bind("gemm2"); + } + + std::unordered_map + invert_map_ins(const std::unordered_map& map_ins) const + { + std::unordered_map inverse_map; + for(auto const& [key, value] : map_ins) + { + assert(not contains(inverse_map, value)); + inverse_map[value] = key; + } + return inverse_map; + } + + auto finalize_attention_module(module_ref m) const + { + eliminate_common_subexpression{}.apply(*m); + dead_code_elimination{}.apply(*m); + } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto gemm2 = r.instructions["gemm2"]; + auto fused_reduce = r.instructions["fused_reduce"]; + auto gemm1 = r.instructions["gemm1"]; + + auto axes = fused_reduce->get_operator().to_value()["axes"]; + if(axes.size() != 1) + return; + + module m_attn; + std::unordered_map map_main_to_mattn; + + // Add first gemm and fuse any input shape ops + module fuse_gemm1; + auto [anchor_op, top_inputs] = + fuse_input_ops_and_gemm_based_op(&fuse_gemm1, gemm1->inputs(), gemm1->get_operator()); + fuse_gemm1.add_return({anchor_op}); + m_attn.add_params(top_inputs, &map_main_to_mattn); + + std::unordered_map map_gemm1_to_mattn(map_main_to_mattn); + auto m_gemm1 = m_attn.fuse(fuse_gemm1, top_inputs, &map_gemm1_to_mattn).front(); + map_main_to_mattn[gemm1] = m_gemm1; + + if(contains(r.instructions, "rsp")) + { + auto rsp = r.instructions["rsp"]; + auto m_rsp = m_attn.add_instruction(rsp->get_operator(), {m_gemm1}); + map_main_to_mattn[rsp] = m_rsp; + } + // Add pointwise-softmax, unroll any pointwise modules back to base ops + m_attn.add_params(fused_reduce->inputs(), &map_main_to_mattn); + std::unordered_map map_mfr_to_mattn(map_main_to_mattn); + auto pw_softmax = m_attn + .fuse(*fused_reduce->module_inputs().front(), + fused_reduce->inputs(), + &map_mfr_to_mattn, + &unroll_pointwise) + .front(); + + // fused_reduce submodule should end with a softmax + auto result = match::match_instruction(m_attn, pw_softmax, match::softmax()); + if(result.result != pw_softmax) + return; + + // Insert explict softmax op - required for MLIR + auto softmax_in = result.instructions["x"]; + auto softmax = m_attn.insert_instruction( + std::next(softmax_in), make_op("softmax", {{"axis", axes.front()}}), softmax_in); + map_main_to_mattn[fused_reduce] = softmax; + + // all preceeding ops should be fusable ops + if(not std::all_of(m_gemm1, softmax, [](auto i) { + return (is_pointwise_op_supported_by_mlir(i) or + contains(reshaper_names(), i.name())); + })) + return; + + // Add second gemm and fuse any input shape ops + module fuse_gemm2; + auto [anchor_op2, top_inputs2] = + fuse_input_ops_and_gemm_based_op(&fuse_gemm2, gemm2->inputs(), gemm2->get_operator()); + fuse_gemm2.add_return({anchor_op2}); + m_attn.add_params(top_inputs2, &map_main_to_mattn); + + std::unordered_map map_gemm2_to_mattn(map_main_to_mattn); + auto m_gemm2 = m_attn.fuse(fuse_gemm2, top_inputs2, &map_gemm2_to_mattn).front(); + map_main_to_mattn[gemm2] = m_gemm2; + + // Fuse any succeeding pointwise module + if(contains(r.instructions, "trailing_pm")) + { + auto trailing_pm_ins = r.instructions["trailing_pm"]; + auto lit_map = create_param_map_with_literals( + &m_attn, trailing_pm_ins->module_inputs().front(), 
trailing_pm_ins->get_shape()); + m_attn.add_params(trailing_pm_ins->inputs(), &map_main_to_mattn); + map_main_to_mattn.insert(lit_map.begin(), lit_map.end()); + std::unordered_map map_pm_to_mattn(map_main_to_mattn); + auto fused_pw_outs = m_attn + .fuse(*trailing_pm_ins->module_inputs().front(), + trailing_pm_ins->inputs(), + &map_pm_to_mattn) + .front(); + map_main_to_mattn[trailing_pm_ins] = fused_pw_outs; + m_attn.add_return({fused_pw_outs}); + } + else + { + m_attn.add_return({m_gemm2}); + } + + finalize_attention_module(&m_attn); + auto map_mattn_to_main = invert_map_ins(map_main_to_mattn); + auto new_inputs = m_attn.get_inputs(map_mattn_to_main); + + module_ref mpm_attn = mpm.create_module( + "mlir_attn_" + fused_reduce->module_inputs().front()->name(), std::move(m_attn)); + mpm_attn->set_bypass(); + + mpm.get_module().replace_instruction( + r.result, mlir_op{gemm1->get_operator()}, mlir_contiguous(mpm, new_inputs), {mpm_attn}); + } +}; + +struct find_mlir_attention_fused_ops : public find_mlir_standalone_attention_op +{ + auto matcher() const + { + auto standalone_matcher = find_mlir_standalone_attention_op::matcher(); + return mlir_pointwise()( + match::any_of[match::inputs()](standalone_matcher).bind("trailing_pm")); + ; + } +}; + +struct find_pointwise_mlir +{ + auto supported_pointwise() const { return mlir_input_pointwise(match::used_once()); } + + auto matcher() const + { + return match::name("gpu::mlir_op")(match::any_of[match::inputs()](supported_pointwise())); + } + + static bool is_simple_op(const_module_ref pm, std::initializer_list op_names) + { + auto last = std::prev(pm->end()); + assert(last->name() == "@return"); + if(last->inputs().size() != 1) + return false; + auto rins = last->inputs().front(); + auto op_ins = std::find_if(pm->begin(), pm->end(), [](const instruction& x) { + return not contains({"@param", "@literal", "broadcast", "multibroadcast"}, x.name()); + }); + if(op_ins != rins) + return false; + return contains(op_names, op_ins->name()); + } + + static instruction_ref insert_pointwise(module& m, + instruction_ref ins, + const operation& op, + const std::vector& inputs, + const std::vector& mod_args) + { + // Only used in assert + (void)mod_args; + assert(mod_args.empty()); + return insert_common_op(m, ins, op, inputs, {.common_type = false}); + } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto ins = r.result; + + auto* mm = ins->module_inputs().front(); + std::vector pws; + std::copy_if( + ins->inputs().begin(), + ins->inputs().end(), + std::back_inserter(pws), + [&](instruction_ref input) { + if(not match::instruction_matches(mpm.get_module(), input, supported_pointwise())) + return false; + auto* pm = input->module_inputs().front(); + if(input->inputs().size() > 1 and not is_simple_op(pm, {"dequantizelinear"})) + { + if(not enabled(MIGRAPHX_ENABLE_MLIR_INPUT_FUSION{})) + return false; + } + return true; + }); + if(pws.empty()) + return; + + std::string module_name; + std::transform( + pws.begin(), pws.end(), join_back_inserter(module_name), [](instruction_ref pw) { + return pw->module_inputs().front()->name() + ":"; + }); + module_name += mm->name(); + module_ref m = mpm.create_module(module_name); + m->set_bypass(); + + std::unordered_map map_ins; + for(auto pw : pws) + { + auto* pm = pw->module_inputs().front(); + fuse_input_ops(m, pw->inputs(), &map_ins); + auto rins = m->fuse(*pm, pw->inputs(), &map_ins, &insert_pointwise).front(); + map_ins[pw] = rins; + } + + auto ret = m->fuse(*mm, ins->inputs(), 
&map_ins); + m->add_return({ret}); + + auto inputs = find_inputs(map_ins, &mpm.get_module(), m); + mpm.get_module().replace_instruction( + ins, ins->get_operator(), mlir_contiguous(mpm, inputs), {m}); + } +}; + +struct find_unpack_int4_mlir_op +{ + auto matcher() const + { + return match::name("gpu::mlir_op")( + match::any_of[match::inputs()](match::name("unpack_int4").bind("unpack_int4"))); + } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto ins = r.result; + auto* mm = ins->module_inputs().front(); + module_ref nm = mpm.create_module("int4:" + mm->name()); + nm->set_bypass(); + + std::vector x_in; + std::unordered_map map_ins; + int ct = 0; + + for(auto input : ins->inputs()) + { + if(input->get_operator().name() == "unpack_int4") + { + auto unpack_input = input->inputs()[0]; + instruction_ref t_ins = + nm->add_parameter(param_name(++ct), unpack_input->get_shape().as_standard()); + map_ins[input] = nm->add_instruction(input->get_operator(), t_ins); + x_in.push_back(unpack_input); + } + else + { + map_ins[input] = + nm->add_parameter(param_name(++ct), input->get_shape().as_standard()); + x_in.push_back(input); + } + } + auto ret = nm->fuse(*mm, ins->inputs(), &map_ins); + nm->add_return({ret}); + mpm.get_module().replace_instruction(ins, ins->get_operator(), x_in, {nm}); + } +}; + +} // namespace + +#endif // MIGRAPHX_MLIR + +void fuse_mlir::apply(module_pass_manager& mpm) const +{ +#ifdef MIGRAPHX_MLIR + std::size_t counter = 0; + const auto& device_name = ctx == nullptr ? "" : ctx->get_current_device().get_gfx_name(); + const bool is_navi = starts_with(device_name, "gfx11") or starts_with(device_name, "gfx12"); + + auto get_mode = [&](std::string_view option, mlir_mode m1, mlir_mode m2 = mlir_mode::fast) { + if(specific_op(option)) + return mlir_mode::none; + if(specific_op(option)) + return mlir_mode::all; + if(is_navi) + return mlir_mode::all; + return std::max(m1, m2); + }; + + // Attention offloads; default disabled + if(mlir_attention_enabled(ctx) or enable_extra) + { + match::find_matches(mpm, find_mlir_attention_fused_ops{mlir_mode::all}); + mpm.run_pass(dead_code_elimination{}); + match::find_matches(mpm, find_mlir_standalone_attention_op{mlir_mode::all}); + mpm.run_pass(dead_code_elimination{}); + } + + match::find_matches( + mpm, + find_mlir_fused_ops{.conv_mode = get_mode("fused_convolution", mlir_mode::fast), + .dot_mode = get_mode("fused_dot", mlir_mode::fast)}); + + match::find_matches( + mpm, + find_mlir_standalone_convolution_op{.mode = get_mode("convolution", mlir_mode::fast), + .counter = &counter}, + find_mlir_standalone_dot_op{.mode = get_mode("dot", mlir_mode::fast), .counter = &counter}); + + mpm.run_pass(dead_code_elimination{}); + if(enabled(MIGRAPHX_ENABLE_MLIR_REDUCE_FUSION{})) + { + match::find_matches( + mpm, + find_mlir_split_reduce{.conv_mode = get_mode("fused_convolution", mlir_mode::fast), + .dot_mode = get_mode("fused_dot", mlir_mode::fast)}); + } + + match::find_matches(mpm, find_pointwise_mlir{}); + match::find_matches(mpm, find_unpack_int4_mlir_op{}); + +#else + (void)mpm; +#endif +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/fuse_ops.cpp b/docker/rocm/migraphx/targets/gpu/fuse_ops.cpp new file mode 100644 index 000000000..5e93ccf5e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/fuse_ops.cpp @@ -0,0 +1,1060 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_HIPBLASLT_GEMM) +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_MIOPEN_FUSION) +#if MIGRAPHX_USE_MIOPEN +struct fusion +{ + using op_t = miopenFusionOpDescriptor_t; + shared fp; + + // Used as a temporary hack to keep descriptor references alive + std::vector> storage; + + template + auto keep_alive(T x) + { + auto result = share(std::move(x)); + storage.push_back(result); + return result; + } + + fusion() = default; + + fusion(const shape& input) + { + assert(input.standard()); + auto t = make_tensor(input); + fp = make_fusion_plan(t); + assert(fp); + keep_alive(std::move(t)); + } + + bool empty() const { return fp == nullptr; } + + op_t operator[](std::size_t i) const + { + assert(fp); + op_t result; + auto status = miopenFusionPlanGetOp(fp.get(), i, &result); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("Failed retrieving operator at " + std::to_string(i)); + return result; + } + + auto get() const + { + assert(fp); + return fp.get(); + } + + op_t create_bias(const shape& bias) + { + assert(fp); + op_t result; + auto b = shape{bias.type(), {1, bias.lens().at(1), 1, 1}}; + auto t = keep_alive(make_tensor(b)); + auto status = miopenCreateOpBiasForward(fp.get(), &result, t.get()); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("Creating operator failed"); + return result; + } + + op_t create_relu() + { + assert(fp); + op_t result; + auto status = miopenCreateOpActivationForward(fp.get(), &result, miopenActivationRELU); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("Creating operator failed"); + return result; + } + + op_t create_conv(const op::convolution& op, const shape& weights) + { + assert(fp); + op_t result; + auto cd = keep_alive(make_conv(op)); + auto t = keep_alive(make_tensor(weights)); + auto status = miopenCreateOpConvForward(fp.get(), &result, cd.get(), t.get()); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("Creating operator failed"); + return result; + } + + shape get_workspace(context&) + { + // assert(fp); + // TODO: Use zero workspace for now + std::size_t ws_size = 0; + // int algo_count = 1; + // miopenConvFwdAlgorithm_t algo; + // miopenFusionPlanConvolutionGetAlgo(fp.get(), 1, 
&algo_count, &algo); + // miopenFusionPlanGetWorkSpaceSize(ctx.get_stream().get_miopen(), fp.get(), &ws_size, + // algo); + return shape{shape::int8_type, {ws_size}}; + } + + bool compile(context& ctx) + { + assert(fp); + return miopenCompileFusionPlan(ctx.get_stream().get_miopen(), fp.get()) == + miopenStatusSuccess; + } + + argument execute(context& ctx, + const fused_operator_args& fargs, + const argument& x, + const argument& y) const + { + assert(fp); + auto x_td = make_tensor(x.get_shape()); + auto y_td = make_tensor(y.get_shape()); + auto status = miopenExecuteFusionPlan(ctx.get_stream().get_miopen(), + fp.get(), + x_td.get(), + x.implicit(), + y_td.get(), + y.implicit(), + fargs.get()); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("Failed to execute fusion plan"); + return y; + } +}; +#endif + +const std::unordered_set& get_supported_archs() +{ + static std::unordered_set supported_archs{ + "gfx900", "gfx906", "gfx908", "gfx1030", "gfx940"}; + return supported_archs; +} +#if MIGRAPHX_USE_MIOPEN +MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins) +{ + auto&& s = ins->get_shape(); + return s.broadcasted() and s.strides().size() == 4 and s.strides()[0] == 0 and + s.strides()[1] != 0 and s.strides()[2] == 0 and s.strides()[3] == 0; +} + +MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins) +{ + const auto device_name = trim(split_string(get_device_name(), ':').front()); + if(not contains(get_supported_archs(), device_name)) + return false; + if(enabled(MIGRAPHX_DISABLE_MIOPEN_FUSION{})) + return false; + if(ins->name() != "gpu::convolution") + return false; + if(ins->get_shape().type() != shape::float_type) + return false; + auto wei = ins->inputs().at(1)->get_shape(); + assert(wei.lens().size() == 4); + auto miopen_conv_op = ins->get_operator().to_value(); + auto algo = miopen_conv_op.at("algo").to(); + auto conv_op = from_value(miopen_conv_op["op"]); + if(conv_op.group > 1) + return false; + if(wei.lens()[1] > 512 and algo != miopenConvolutionFwdAlgoWinograd) + return false; + + // Do not fuse non-symmetric input + auto input_lens = ins->inputs().at(0)->get_shape().lens(); + if(input_lens[2] != input_lens[3] or wei.lens()[2] != wei.lens()[3]) + return false; + + // Dont fuse winograd for non-3x3s since there is no fused windograd for those configs + if(algo == miopenConvolutionFwdAlgoWinograd and wei.lens()[2] != 3 and wei.lens()[3] != 3 and + contains({{1, 1}}, conv_op.stride)) + return false; + return contains({{0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}}, conv_op.padding) and + contains({{0, 0}, {1, 1}}, conv_op.stride) and contains({{1, 1}}, conv_op.dilation); +} +#endif + +void move_broadcasted_back(std::vector& args) +{ + // Ensure the last arguments is the broadcasted one + auto last = std::prev(args.end()); + auto it = + std::find_if(args.begin(), last, [](auto arg) { return arg->get_shape().broadcasted(); }); + if(it != last) + std::swap(*it, *std::prev(last)); +} + +void move_standard_front(std::vector& args) +{ + // Ensure the first arguments is the standard one + auto last = std::prev(args.end()); + auto it = + std::find_if(args.begin(), last, [](auto arg) { return arg->get_shape().standard(); }); + if(it != last) + std::swap(*it, args.front()); +} + +auto gpu_name(const std::string& s) { return match::name("gpu::" + s); } + +namespace { +#if MIGRAPHX_USE_MIOPEN +struct miopen_fusion +{ + struct fuse_op_data + { + operation op; + float alpha = 1; + float beta = 0; + }; + struct fuse_op : fuse_op_data, reflect_equality, reflect_stream + { + template + static 
auto reflect(Self& self, F f) + { + return pack(f(self.op, "op"), f(self.alpha, "alpha"), f(self.beta, "beta")); + } + }; + std::vector ops = {}; + fusion f = {}; + std::function&)> execute; + template + static auto reflect(Self& self, F f) + { + return pack(f(self.ops, "ops")); + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } + + value compile(context& ctx, const shape&, std::vector inputs) + { + // Compensate for allocation + inputs.pop_back(); + std::size_t i = 0; + f = fusion(inputs[i]); + i++; + std::vector&)>> + invokers; + for(auto&& fop : ops) + { + if(i > inputs.size()) + { + f = {}; + return {}; + } + if(fop.op.name() == "convolution") + { + auto* mop = f.create_conv(any_cast(fop.op), inputs[i]); + invokers.push_back( + [=](const fused_operator_args& fargs, const std::vector& args) { + miopenSetOpArgsConvForward( + fargs.get(), mop, &fop.alpha, &fop.beta, args[i].implicit()); + }); + i++; + } + else if(fop.op.name() == "add") + { + auto* mop = f.create_bias(inputs[i]); + invokers.push_back( + [=](const fused_operator_args& fargs, const std::vector& args) { + miopenSetOpArgsBiasForward( + fargs.get(), mop, &fop.alpha, &fop.beta, args[i].implicit()); + }); + i++; + } + else if(fop.op.name() == "relu") + { + auto* mop = f.create_relu(); + invokers.push_back([=](const fused_operator_args& fargs, + const std::vector&) { + miopenSetOpArgsActivForward(fargs.get(), mop, &fop.alpha, &fop.beta, 0, 0, 0); + }); + } + else + { + f = {}; + return {}; + } + } + if(not f.compile(ctx)) + { + f = {}; + return {}; + } + execute = [invokers](context& c, const fusion& ff, const std::vector& args) { + auto fargs = make_fused_args(); + for(auto&& invoker : invokers) + invoker(fargs, args); + ff.execute(c, fargs, args.front(), args.back()); + }; + return {{"workspace", f.get_workspace(ctx).bytes()}}; + } + void finalize(context& ctx, const shape& output_shape, const std::vector& inputs) + { + if(not f.empty()) + return; + auto v = compile(ctx, output_shape, inputs); + if(not v.is_object()) + MIGRAPHX_THROW("Failed to compile fusion plan"); + } + std::string name() const { return "gpu::miopen_fusion"; } + shape compute_shape(const std::vector& inputs) const + { + if(ops.empty()) + return {}; + // TODO: Check number of arguments + return ops.front().op.compute_shape({inputs[0], inputs[1]}); + } + argument compute(context& ctx, const shape&, const std::vector& args) const + { + execute(ctx, f, args); + return args.back(); + } +}; +MIGRAPHX_REGISTER_OP(miopen_fusion) + +struct miopen_conv_bias +{ + op::convolution op; + fusion fp = {}; + fusion::op_t conv = {}; + fusion::op_t bias = {}; + + template + static auto reflect(Self& self, F f) + { + return op::convolution::reflect(self.op, f); + } + + std::string name() const { return "gpu::conv_bias"; } + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(5); + // TODO: Check slices + return op.normalize_compute_shape({inputs.at(0), inputs.at(1)}); + } + argument compute(context& ctx, const shape&, const std::vector& args) const + { + auto fargs = make_fused_args(); + float alpha = 1; + float beta = 0; + miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit()); + miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit()); + return fp.execute(ctx, fargs, args[0], args[4]); + } + + void finalize(context& ctx, const shape&, const std::vector& inputs) + { + fp = fusion(inputs[0]); + conv = fp.create_conv(op, inputs[1]); + 
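// note (editor, inferred from apply_conv_bias defined below): inputs are laid out as {x, weights, workspace, bias, output allocation} + 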
bias = fp.create_bias(inputs[3]); + if(not fp.compile(ctx)) + MIGRAPHX_THROW("Failed to compile fusion plan"); + } + + shape get_workspace(context& ctx) { return fp.get_workspace(ctx); } + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; +MIGRAPHX_REGISTER_OP(miopen_conv_bias) + +struct miopen_conv_bias_relu +{ + op::convolution op; + fusion fp = {}; + fusion::op_t conv = {}; + fusion::op_t bias = {}; + fusion::op_t relu = {}; + + template + static auto reflect(Self& self, F f) + { + return op::convolution::reflect(self.op, f); + } + + std::string name() const { return "gpu::conv_bias_relu"; } + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(5); + // TODO: Check slices + return op.normalize_compute_shape({inputs.at(0), inputs.at(1)}); + } + argument compute(context& ctx, const shape&, const std::vector& args) const + { + auto fargs = make_fused_args(); + float alpha = 1; + float beta = 0; + miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit()); + miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit()); + miopenSetOpArgsActivForward(fargs.get(), relu, &alpha, &beta, 0, 0, 0); + return fp.execute(ctx, fargs, args[0], args[4]); + } + void finalize(context& ctx, const shape&, const std::vector& inputs) + { + fp = fusion(inputs[0]); + conv = fp.create_conv(op, inputs[1]); + bias = fp.create_bias(inputs[3]); + relu = fp.create_relu(); + fp.compile(ctx); + } + + shape get_workspace(context& ctx) { return fp.get_workspace(ctx); } + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; +MIGRAPHX_REGISTER_OP(miopen_conv_bias_relu) + +template +auto conv_bias(Ms... ms) +{ + return match::name("gpu::add")( + match::either_arg(0, 1)(bias_shape(match::used_once()).bind("bias"), + fusable_conv(match::used_once()).bind("conv")), + ms...); +} + +template +void apply_conv_bias(context& ctx, module& m, const match::matcher_result& r) +{ + auto conv_ins = r.instructions["conv"]; + auto bias_ins = r.instructions["bias"]; + auto ins = r.result; + auto input_ins = conv_ins->inputs().at(0); + auto weights_ins = conv_ins->inputs().at(1); + auto conv_op = from_value((conv_ins->get_operator()).to_value()["op"]); + auto alloc_ins = ins->inputs().back(); + auto old_ws_ins = conv_ins->inputs().at(2); + + Op cb{conv_op}; + // TODO: Insert ws allocation + auto ws = cb.get_workspace(ctx); + (void)ws; + m.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins); +} +#endif + +template +inline auto precompile_name(Strings... 
names) // NOLINT +{ + return match::make_basic_pred_matcher([=](instruction_ref ins) { + if(ins->name() != "gpu::precompile_op") + return false; + auto op = from_value(ins->get_operator().to_value().at("op")); + return (contains({names...}, op.name())); + }); +} + +#if MIGRAPHX_USE_MIOPEN +struct find_conv_bias +{ + context* ctx = nullptr; + auto matcher() const + { + auto relu = match::name(std::unordered_set{"gpu::relu"}); + return conv_bias(match::none_of(match::output(relu))); + } + + void apply(module& m, const match::matcher_result& r) const + { + apply_conv_bias(*ctx, m, r); + } +}; + +struct find_conv_bias_relu +{ + context* ctx = nullptr; + auto matcher() const { return match::name("gpu::relu")(match::arg(0)(conv_bias())); } + + void apply(module& m, const match::matcher_result& r) const + { + apply_conv_bias(*ctx, m, r); + } +}; +struct find_conv_pointwise +{ + context* ctx = nullptr; + auto matcher() const + { + return precompile_name("pointwise")( + match::nargs(3), + match::either_arg(0, 1)(bias_shape(match::used_once()).bind("bias"), + fusable_conv(match::used_once()).bind("conv"))); + } + + void apply(module& m, const match::matcher_result& r) const + { + auto conv_ins = r.instructions["conv"]; + auto bias_ins = r.instructions["bias"]; + auto ins = r.result; + auto input_ins = conv_ins->inputs().at(0); + auto weights_ins = conv_ins->inputs().at(1); + auto conv_op = from_value(conv_ins->get_operator().to_value()["op"]); + auto alloc_ins = ins->inputs().back(); + + module_ref pm = ins->module_inputs().front(); + + miopen_fusion op{}; + op.ops.push_back({{conv_op}}); + for(auto&& i : *pm) + { + if(i.name()[0] == '@') + continue; + op.ops.push_back({{i.get_operator()}}); + } + std::vector inputs = {input_ins, weights_ins, bias_ins, alloc_ins}; + auto v = op.compile(*ctx, ins->get_shape(), to_shapes(inputs)); + if(not v.is_object()) + return; + m.replace_instruction(ins, op, inputs); + } +}; +#endif + +#if MIGRAPHX_USE_ROCBLAS or MIGRAPHX_USE_HIPBLASLT +struct gemm_pointwise +{ + // TODO: Move to matcher.hpp + static auto match_param(const std::string& name) + { + return match::make_basic_pred_matcher([=](auto ins) { + if(ins->name() != "@param") + return false; + auto p = any_cast(ins->get_operator()); + return p.parameter == name; + }); + } + + template + static auto match_mul_const(M m, const std::string& var) + { + return match::name("mul")(match::either_arg(0, 1)(match::name("@literal").bind(var), m)) + .bind(var + "_mul"); + } + + static auto match_add(const std::string& input, const std::string& output) + { + auto param = match::name("@param"); + auto add = match::name("add")(match::args(param, param)); + auto inner_mul = match::any_of(match_mul_const(match_param(input), "alpha"), + match_mul_const(match_param(output), "beta")); + auto mul_add = match::name("add")(match::either_arg(0, 1)(inner_mul, param)); + auto add_mul = match_mul_const(add, "gamma"); + return match::name("@return")(match::args(match::any_of(add, mul_add, add_mul))); + } + + static auto match_mul(const std::string& input) + { + auto mul = match_mul_const(match_param(input), "alpha"); + return match::name("@return")(match::args(mul)); + } + + static float get_float(instruction_ref ins) { return ins->get_literal().at(); } + + template + static bool update_gemm(Gemm& gemm, module_ref pm, unsigned input) + { + auto names = pm->get_parameter_names(); + std::sort(names.begin(), names.end()); + if(names.size() == 1) + { + auto mr = match::match_instruction(*pm, std::prev(pm->end()), match_mul(names[input])); + 
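// not a plain scale-by-literal module, so there is nothing to fold into alpha + 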
if(mr.result == pm->end()) + return false; + gemm.alpha *= get_float(mr.instructions["alpha"]); + return true; + } + else if(names.size() == 2) + { + unsigned output = input == 0 ? 1 : 0; + auto mr = match::match_instruction( + *pm, std::prev(pm->end()), match_add(names[input], names[output])); + if(mr.result == pm->end()) + return false; + if(contains(mr.instructions, "alpha_mul")) + gemm.alpha *= get_float(mr.instructions["alpha"]); + else if(contains(mr.instructions, "beta_mul")) + gemm.beta *= get_float(mr.instructions["beta"]); + else if(contains(mr.instructions, "gamma_mul")) + { + gemm.alpha *= get_float(mr.instructions["gamma"]); + gemm.beta *= get_float(mr.instructions["gamma"]); + } + return true; + } + else + { + return false; + } + } +}; +#endif + +#if MIGRAPHX_USE_ROCBLAS +struct find_rocblas_gemm_pointwise : gemm_pointwise +{ + auto matcher() const + { + auto gemm_op = match::name("gpu::gemm")(match::nargs(3), match::used_once()).bind("gemm"); + auto binary_op = match::all_of( + match::nargs(3), + match::either_arg(0, 1)( + match::any_of(match::standard_shape(), match::is_constant()).bind("c"), gemm_op)); + auto unary_op = match::all_of(match::nargs(2), match::arg(0)(gemm_op)); + return precompile_name("pointwise")(match::any_of(binary_op, unary_op)); + } + + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + auto gemm_ins = r.instructions["gemm"]; + + auto gemm = any_cast>(gemm_ins->get_operator()); + + // Already fused gemm + if(not float_equal(gemm.beta, 0)) + return; + if(ins->inputs().size() == 3) + gemm.beta = 1; + + if(not update_gemm( + gemm, ins->module_inputs().front(), ins->inputs().front() == gemm_ins ? 0 : 1)) + return; + + auto inputs = gemm_ins->inputs(); + inputs.pop_back(); + + if(ins->inputs().size() == 3) + { + auto c_ins = r.instructions["c"]; + shape s = c_ins->get_shape(); + // const-fold input if not standard shape since rocblas can't handle it + // Updated for a case where "standard" shape has out-of-sequence strides + if(not s.standard()) + { + auto c = make_op("contiguous"); + auto l = c.compute(c.compute_shape({c_ins->get_shape()}), {c_ins->eval()}); + c_ins = m.add_literal(l.get_shape(), l.data()); + } + inputs.push_back(c_ins); + } + + inputs.push_back(ins->inputs().back()); + + m.replace_instruction(ins, gemm, inputs); + } +}; +#endif + +#if MIGRAPHX_USE_HIPBLASLT +struct find_hipblas_gemm_pointwise : gemm_pointwise +{ + auto matcher() const + { + auto gemm_op = + match::name("gpu::hipblaslt_op")(match::nargs(3), match::used_once()).bind("hip_gemm"); + auto binary_op = match::all_of( + match::nargs(3), + match::either_arg(0, 1)( + match::any_of(match::standard_shape(), match::is_constant()).bind("c"), gemm_op)); + auto unary_op = match::all_of(match::nargs(2), match::arg(0)(gemm_op)); + return precompile_name("pointwise")(match::any_of(binary_op, unary_op)); + } + + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + auto gemm_ins = r.instructions["hip_gemm"]; + + auto gemm_op = any_cast(gemm_ins->get_operator()).op; + + if(gemm_op.name() != "gpu::hip_gemm") + return; + + auto gemm = any_cast>(gemm_op); + + // Already fused gemm + if(not float_equal(gemm.beta, 0)) + return; + if(ins->inputs().size() == 3) + gemm.beta = 1; + if(not update_gemm( + gemm, ins->module_inputs().front(), ins->inputs().front() == gemm_ins ? 
0 : 1)) + { + return; + } + auto inputs = gemm_ins->inputs(); + inputs.pop_back(); + if(ins->inputs().size() == 3) + { + auto c_ins = r.instructions["c"]; + shape s = c_ins->get_shape(); + // const-fold input if not standard shape + // Updated for a case where "standard" shape has out-of-sequence strides + if(not s.standard()) + { + auto c = make_op("contiguous"); + auto l = c.compute(c.compute_shape({c_ins->get_shape()}), {c_ins->eval()}); + c_ins = m.add_literal(l.get_shape(), l.data()); + } + inputs.push_back(c_ins); + } + inputs.push_back(ins->inputs().back()); + + operation new_gemm_op = gemm; + auto new_ins = m.insert_instruction( + ins, make_op("gpu::hipblaslt_op", {{"op", to_value(new_gemm_op)}}), inputs); + m.replace_instruction(ins, new_ins); + } +}; +#endif + +struct contiguous_transpose_gemm +{ + template + static bool is_swapped(const Vector& perm, std::size_t i, std::size_t j) + { + if(i >= perm.size() or j >= perm.size()) + return false; + auto perm2 = perm; + std::iota(perm2.begin(), perm2.end(), 0); + std::swap(perm2[i], perm2[j]); + return perm2 == perm; + } +}; + +struct find_contiguous_transpose_rocblas_gemm : contiguous_transpose_gemm +{ + auto matcher() const + { + return match::name("gpu::contiguous")(match::arg(0)( + match::name("transpose")( + match::arg(0)(match::name("gpu::gemm")(match::used_once()).bind("gemm"))) + .bind("transpose"))); + } + + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + auto gemm = r.instructions["gemm"]; + auto alloc = gemm->inputs().back(); + auto transpose = r.instructions["transpose"]; + auto perm = transpose->get_operator().to_value()["permutation"].to_vector(); + auto iperm = invert_permutation(perm); + + if(perm.size() < 3) + return; + + if(not is_swapped(perm, perm.size() - 3, perm.size() - 2)) + return; + + auto lens = gemm->get_shape().lens(); + if(lens.size() > 3 and + not std::all_of(lens.begin(), lens.end() - 3, [](auto i) { return i == 1; })) + return; + + auto gemmv = gemm->get_operator().to_value(); + gemmv["trans_batch"] = 1; + + auto s = shape{alloc->get_shape().type(), reorder_dims(alloc->get_shape().lens(), iperm)}; + auto new_alloc = m.insert_instruction(gemm, make_op("allocate", {{"shape", to_value(s)}})); + auto alloc_transpose = + m.insert_instruction(gemm, make_op("transpose", {{"permutation", perm}}), new_alloc); + + auto inputs = gemm->inputs(); + inputs.back() = alloc_transpose; + auto new_gemm = m.insert_instruction(gemm, make_op("gpu::gemm", gemmv), inputs); + auto gemm_transpoe = m.insert_instruction(gemm, transpose->get_operator(), new_gemm); + + m.replace_instruction(ins, gemm_transpoe); + } +}; + +#if MIGRAPHX_USE_HIPBLASLT +struct find_contiguous_transpose_hip_gemm : contiguous_transpose_gemm +{ + auto matcher() const + { + return match::name("gpu::contiguous")(match::arg(0)( + match::name("transpose")( + match::arg(0)( + match::name("gpu::hipblaslt_op")(match::used_once()).bind("hip_gemm"))) + .bind("transpose"))); + } + + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + auto gemm_ins = r.instructions["hip_gemm"]; + auto gemm_op = any_cast(gemm_ins->get_operator()).op; + + if(gemm_op.name() != "gpu::hip_gemm") + return; + + auto gemm = any_cast>(gemm_op); + + auto alloc = gemm_ins->inputs().back(); + auto transpose = r.instructions["transpose"]; + auto perm = transpose->get_operator().to_value()["permutation"].to_vector(); + auto iperm = invert_permutation(perm); + + if(perm.size() < 3) + return; + + if(not is_swapped(perm, 
perm.size() - 3, perm.size() - 2)) + return; + + auto lens = gemm_ins->get_shape().lens(); + if(lens.size() > 3 and + not std::all_of(lens.begin(), lens.end() - 3, [](auto i) { return i == 1; })) + return; + + gemm.trans_batch = 1; + + auto s = shape{alloc->get_shape().type(), reorder_dims(alloc->get_shape().lens(), iperm)}; + auto new_alloc = + m.insert_instruction(gemm_ins, make_op("allocate", {{"shape", to_value(s)}})); + + auto alloc_transpose = m.insert_instruction( + gemm_ins, make_op("transpose", {{"permutation", perm}}), new_alloc); + + auto inputs = gemm_ins->inputs(); + inputs.back() = alloc_transpose; + operation new_gemm_op = gemm; + auto new_gemm = m.insert_instruction( + gemm_ins, make_op("gpu::hipblaslt_op", {{"op", to_value(new_gemm_op)}}), inputs); + + auto gemm_transpoe = m.insert_instruction(gemm_ins, transpose->get_operator(), new_gemm); + + m.replace_instruction(ins, gemm_transpoe); + } +}; +#endif + +struct find_commutative_broadcast +{ + auto matcher() const + { + return match::name("gpu::add", "gpu::mul")(match::arg(1)(match::broadcast_shape())); + } + + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + auto args = ins->inputs(); + move_broadcasted_back(args); + + m.replace_instruction(ins, ins->get_operator(), args); + } +}; +} // namespace + +struct find_contiguous +{ + auto matcher() const { return match::name("gpu::contiguous"); } + + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + + m.replace_instruction( + ins, + make_op("gpu::precompile_op", {{"op", to_value(make_op("contiguous"))}}), + ins->inputs()); + } +}; + +struct find_contiguous_layout_pointwise +{ + auto matcher() const + { + auto cont_pw = precompile_name("pointwise")(match::any_of[match::inputs()]( + match::name("gpu::contiguous")(match::used_once()).bind("layout_ins"))); + auto layout_pw = precompile_name("pointwise")(match::any_of[match::inputs()]( + precompile_name("layout")(match::used_once()).bind("layout_ins"))); + return match::any_of(cont_pw, layout_pw); + } + + void apply(module& m, const match::matcher_result& r) const + { + auto pw_ins = r.result; + auto layout_ins = r.instructions["layout_ins"]; + auto layout_input = layout_ins->inputs().front(); + auto pw_ins_inputs = pw_ins->inputs(); + replace(pw_ins_inputs, layout_ins, layout_input); + // Ensure the output shape of the pointwise module retains the memory layout + auto pw_op_val = pw_ins->get_operator().to_value(); + pw_op_val["output_shape"] = to_value(pw_ins->get_shape()); + + auto new_ins = m.insert_instruction( + pw_ins, make_op(pw_ins->name(), pw_op_val), pw_ins_inputs, pw_ins->module_inputs()); + m.replace_instruction(pw_ins, new_ins); + } +}; + +struct find_pointwise_layout_contiguous +{ + auto matcher() const + { + auto is_layout = precompile_name("layout")( + match::arg(0)(match::used_once(), precompile_name("pointwise"))); + auto is_contiguous = match::name("gpu::contiguous")( + match::arg(0)(match::used_once(), precompile_name("pointwise"))); + return match::any_of(is_layout, is_contiguous); + } + + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + auto pw = ins->inputs().front(); + auto alloc = ins->inputs().back(); + auto args = pw->inputs(); + args.back() = alloc; + + // Ensure the output shape of the pointwise module retains the memory layout + auto pw_op_val = pw->get_operator().to_value(); + pw_op_val["output_shape"] = to_value(ins->get_shape()); + + m.replace_instruction(ins, make_op(pw->name(), 
pw_op_val), args, pw->module_inputs()); + } +}; + +struct find_layernorm_pointwise +{ + auto matcher() const + { + return precompile_name("pointwise")(match::arg(0)( + precompile_name("gpu::prelayernorm", "gpu::preadd_layernorm").bind("layernorm"))); + } + + void apply(module& m, const match::matcher_result& r) const + { + auto pw_ins = r.result; + auto layernorm = r.instructions["layernorm"]; + if(not layernorm->module_inputs().empty()) + return; + auto* pm = pw_ins->module_inputs().front(); + auto pw_inputs = pw_ins->inputs(); + auto ln_pos = std::find(pw_inputs.begin(), pw_inputs.end(), layernorm); + assert(ln_pos != pw_inputs.end()); + pw_inputs.erase(ln_pos); + auto inputs = layernorm->inputs(); + inputs.pop_back(); + inputs.insert(inputs.end(), pw_inputs.begin(), pw_inputs.end()); + + // Ensure the output shape retains the memory layout + auto layernorm_op_val = layernorm->get_operator().to_value(); + layernorm_op_val["output_shape"] = to_value(pw_ins->get_shape()); + + m.replace_instruction(pw_ins, make_op(layernorm->name(), layernorm_op_val), inputs, {pm}); + } +}; + +struct find_concat_pointwise +{ + auto matcher() const + { + return precompile_name("pointwise")( + match::arg(0)(precompile_name("concat").bind("concat"))); + } + + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + auto concat = r.instructions["concat"]; + if(not concat->module_inputs().empty()) + return; + + // TODO: Handle type conversions + if(ins->get_shape().type() != concat->get_shape().type()) + return; + + auto* pm = ins->module_inputs().front(); + auto inputs = concat->inputs(); + inputs.pop_back(); + inputs.insert(inputs.end(), ins->inputs().begin() + 1, ins->inputs().end()); + + auto op = concat->get_operator(); + op.from_value({{"additional_args", ins->inputs().size() - 1}, + {"ignore_modules", true}, + {"output_shape", to_value(ins->get_shape())}}); + + m.replace_instruction(ins, op, inputs, {pm}); + } +}; + +void fuse_ops::apply(module& m) const +{ + match::find_matches(m, find_pointwise_layout_contiguous{}, find_contiguous_layout_pointwise{}); + run_passes(m, {dead_code_elimination{}}); +#if MIGRAPHX_USE_MIOPEN + match::find_matches(m, find_conv_pointwise{ctx}, find_conv_bias_relu{ctx}, find_conv_bias{ctx}); + run_passes(m, {dead_code_elimination{}}); +#endif +#if MIGRAPHX_USE_ROCBLAS + match::find_matches(m, find_rocblas_gemm_pointwise{}); +#endif +#if MIGRAPHX_USE_HIPBLASLT + match::find_matches(m, find_hipblas_gemm_pointwise{}); +#endif + match::find_matches(m, + find_layernorm_pointwise{}, + find_concat_pointwise{}, + find_contiguous_transpose_rocblas_gemm{}, +#if MIGRAPHX_USE_HIPBLASLT + find_contiguous_transpose_hip_gemm{}, +#endif + find_commutative_broadcast{}); + match::find_matches(m, find_contiguous{}); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/gemm_impl.cpp b/docker/rocm/migraphx/targets/gpu/gemm_impl.cpp new file mode 100644 index 000000000..d0f750a25 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/gemm_impl.cpp @@ -0,0 +1,708 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +using microseconds = std::chrono::duration; + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +#if MIGRAPHX_USE_ROCBLAS +/* +Regular rocBLAS API takes compute_type as `rocblas_datatype` enum value v/s "ex3" BETA API takes it +as `rocblas_computetype` enum value. `rb_compute_type` is faciliator to implictly cast integer enum +value to required type that can be used inside `common_args` generator. +*/ +struct rb_compute_type +{ + int type = 0; + rb_compute_type(rocblas_datatype t) : type(static_cast(t)) {} + rb_compute_type(rocblas_computetype t) : type(static_cast(t)) {} + operator rocblas_datatype() const { return static_cast(type); } + operator rocblas_computetype() const { return static_cast(type); } +}; + +// Convert rocBLAS datatypes to equivalent Migraphx data types +rocblas_datatype get_type(shape::type_t type) +{ + switch(type) + { + case shape::double_type: return rocblas_datatype_f64_r; + case shape::float_type: return rocblas_datatype_f32_r; + case shape::half_type: return rocblas_datatype_f16_r; + case shape::int8_type: return rocblas_datatype_i8_r; + case shape::uint8_type: return rocblas_datatype_u8_r; + case shape::int32_type: return rocblas_datatype_i32_r; + case shape::uint32_type: return rocblas_datatype_u32_r; + case shape::fp8e4m3fnuz_type: return rocblas_datatype_f8_r; + case shape::fp8e5m2fnuz_type: return rocblas_datatype_bf8_r; + case shape::fp8e4m3fn_type: + case shape::fp8e5m2_type: + case shape::tuple_type: + case shape::bool_type: + case shape::uint16_type: + case shape::int16_type: + case shape::int64_type: + case shape::uint64_type: MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!"); + case shape::bf16_type: return rocblas_datatype_bf16_r; + } + + MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!"); +} + +void blas_shape(const shape& in_shape) +{ + if(in_shape.lens().size() < 2) + return; + auto s = in_shape.normalize_standard(); + if(std::none_of(s.strides().end() - 2, s.strides().end(), [](auto i) { return i == 1; })) + MIGRAPHX_THROW("GPU_GEMM: needs to have one matrix stride as 1"); + if(std::any_of(s.strides().end() - 2, s.strides().end(), [](auto i) { return i == 0; })) + MIGRAPHX_THROW("GPU_GEMM: matrix dimensions can't be broadcasted"); + if(s.lens().size() < 3) + return; + shape batch_shape{s.type(), + {s.lens().begin(), s.lens().end() - 2}, + 
{s.strides().begin(), s.strides().end() - 2}}; + auto batch_shapes = reduce_dims({batch_shape}); + if(batch_shapes.front().lens().size() != 1) + MIGRAPHX_THROW("GPU_GEMM: Batch dimension is not collapsible"); +} + +shape transpose_batch(const shape& s, unsigned trans_batch) +{ + if(trans_batch == 0) + return s; + if(s.lens().size() < 3) + return s; + auto batch = s.lens().size() - 3; + std::vector perm(s.lens().size()); + std::iota(perm.begin(), perm.end(), 0); + std::swap(perm[batch], perm[batch + trans_batch]); + return shape::from_permutation(s.type(), s.lens(), perm); +} + +/** + * Returns results of rocblas_status_success, rocblas_status_perf_degraded, + * or rocblas_status_invalid_value. Caller + * is expected to check for invalid index. Any other result causes an exception. + * + */ +template +auto rocblas_invoke(F f, Pack p, Ts... xs) +{ + return p([=](auto... ws) { + auto status = f(ws..., xs...); + if(status != rocblas_status_success and status != rocblas_status_invalid_value) + { + if(status == rocblas_status_perf_degraded) + { + std::cerr << "WARNING: degraded perf. in rocBLAS call" << std::endl; + } + else + MIGRAPHX_THROW("rocblas_invoke: rocBLAS call failed with status " + + std::to_string(status)); + } + return status; + }); +} + +static bool is_transposed(const shape& s) +{ + if(s.transposed()) + { + return s.strides().back() != 1; + } + + if(not s.broadcasted() and s.strides() != s.as_standard().strides()) + { + auto perm = find_permutation(s); + return not std::is_sorted(perm.begin(), perm.end()); + } + + return false; +} + +static rocblas_int get_batch_stride(const shape& s) +{ + // This value is not needed for non-strided inputs + if(s.strides().size() < 3) + return 0; + else + return s.strides()[s.strides().size() - 3]; +} + +/** + * Wrapper for multiple rocBLAS calls. The constructor creates parameters for + * these calls based on data shapes and other values contained in the associated + * instruction and operation. + * + * The template parameter T is not the type of the matrix data but of the weighting + * coefficients alpha and beta (these are float in rocBLAS internals) + */ +template +struct gemm_impl +{ + gemm_impl(const shape& output_shape, + const std::vector& input_shapes, + T alpha_param, + T beta_param, + bool compute_fp32_flag) + : alpha(alpha_param), + beta(beta_param), + is_3inputs(input_shapes.size() == 4), + compute_fp32(compute_fp32_flag) + { + if(not is_3inputs) + { + beta = 0; + } + + // Create lambdas that will cast alpha, beta to the output shape's type + // and retain the values being pointed to + output_shape.visit_type([&](auto as) { + auto alpha_r = as(alpha); + auto beta_r = as(beta); + if(compute_fp32) + { + get_alpha = [=] { return α }; + get_beta = [=] { return β }; + } + else + { + get_alpha = [=] { return &alpha_r; }; + get_beta = [=] { return &beta_r; }; + } + }); + + transa = is_transposed(input_shapes[0]); + transb = is_transposed(input_shapes[1]); + auto n_dim = output_shape.lens().size(); + auto dim_0 = n_dim - 2; + auto dim_1 = n_dim - 1; + // Leading dimensions of matrices + lda = input_shapes[0].strides()[transa ? dim_1 : dim_0]; + ldb = input_shapes[1].strides()[transb ? dim_1 : dim_0]; + ldc = input_shapes[2].strides()[dim_0]; + ldd = is_3inputs ? 
input_shapes[3].strides()[dim_0] : ldc; + + arg_type = get_type(input_shapes[0].type()); + output_type = get_type(input_shapes[2].type()); + if(output_type == rocblas_datatype_i8_r or output_type == rocblas_datatype_u8_r) + { + output_type = rocblas_datatype_i32_r; + } + compute_type = rb_compute_type{output_type}; + if(compute_fp32) + { + if(arg_type == rocblas_datatype_f16_r or arg_type == rocblas_datatype_bf16_r) + compute_type = rocblas_datatype_f32_r; + } + if(arg_type == rocblas_datatype_f8_r) + { + assert(get_type(input_shapes[1].type()) == rocblas_datatype_f8_r); + compute_type = rocblas_compute_type_f32; + } + + auto a_lens = input_shapes[0].lens(); + auto b_lens = input_shapes[1].lens(); + + auto out_lens = output_shape.lens(); + m = out_lens[dim_0]; + n = out_lens[dim_1]; + k = input_shapes[0].lens()[dim_1]; + + a_stride = get_batch_stride(input_shapes[0]); + b_stride = get_batch_stride(input_shapes[1]); + c_stride = get_batch_stride(input_shapes[2]); + d_stride = is_3inputs ? get_batch_stride(input_shapes[3]) : c_stride; + num_matrices = std::accumulate( + out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies()); + strided_batched = num_matrices > 1; + if(strided_batched and b_stride == 0 and input_shapes[0].standard()) + { + // If the batch dimension of B is broadcasted, then we can + // multiply m by the batch_size and use rocblas_gemm_ex + // instead of rocblas_gemm_strided_batched_ex. + m *= num_matrices; + strided_batched = false; + } + } + + void run(context& ctx, const std::vector& input_args, int32_t solution_idx = 0) const + { +#ifdef MIGRAPHX_USE_ROCBLAS_FP8_API + if(rocblas_fp8_available() and + std::any_of(input_args.begin(), input_args.end(), [](const auto i) { + return i.get_shape().type() == migraphx::shape::fp8e4m3fnuz_type; + })) + { + if(strided_batched) + { + auto common_args = + create_strided_batched_args_common(ctx, compute_type, input_args); + rocblas_invoke(&rocblas_gemm_strided_batched_ex3, + common_args, + rocblas_gemm_algo_standard, + solution_idx, + gemm_flags); + } + else + { + auto common_args = create_gemm_ex_args_common(ctx, compute_type, input_args); + rocblas_invoke(&rocblas_gemm_ex3, + common_args, + rocblas_gemm_algo_standard, + solution_idx, + gemm_flags); + } + } + else +#endif + { + if(strided_batched) + { + auto common_args = + create_strided_batched_args_common(ctx, compute_type, input_args); + rocblas_invoke(&rocblas_gemm_strided_batched_ex, + common_args, + rocblas_gemm_algo_solution_index, + solution_idx, + gemm_flags); + } + else + { + auto common_args = create_gemm_ex_args_common(ctx, compute_type, input_args); + rocblas_invoke(&rocblas_gemm_ex, + common_args, + rocblas_gemm_algo_solution_index, + solution_idx, + gemm_flags); + } + } + } + +#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API + auto validate(context& ctx, const std::vector& input_shapes, int32_t solution_idx) const + { + // Create dummy arguments for the shapes, and call the overloaded method + std::vector input_args; + unsigned long seed = 0; + std::transform(input_shapes.begin(), + input_shapes.end(), + std::back_inserter(input_args), + [&](const shape& x) { + return to_gpu(generate_argument(x, seed++, random_mode::random)); + }); + return validate(ctx, input_args, solution_idx); + } + + /** + * Checks a particular solution for validity by running it with the flag + * rocblas_gemm_flags_check_solution_index (could be invalid if this model was + * tuned with a different rocBLAS version) + * + * @return Returns either solution_idx if valid, or else the default 
value 0 + * if not. The default does not mean list index 0, but tells the picker + * to choose a solution. + */ + int32_t + validate(context& ctx, const std::vector& input_args, int32_t solution_idx) const + { + rocblas_status_ check_valid(rocblas_status_success); + + if(strided_batched) + { + auto common_args = create_strided_batched_args_common(ctx, compute_type, input_args); + check_valid = rocblas_invoke(&rocblas_gemm_strided_batched_ex, + common_args, + rocblas_gemm_algo_solution_index, + solution_idx, + rocblas_gemm_flags_check_solution_index); + } + else + { + auto common_args = create_gemm_ex_args_common(ctx, compute_type, input_args); + check_valid = rocblas_invoke(&rocblas_gemm_ex, + common_args, + rocblas_gemm_algo_solution_index, + solution_idx, + rocblas_gemm_flags_check_solution_index); + } + + if(check_valid == rocblas_status_invalid_value) + { + std::cerr << "WARNING: tuned solution is invalid; reverting to default" << std::endl; + return 0; + } + return solution_idx; + } +#endif + + /** + * Helper method to create that subset of a long rocBLAS argument list that is common + * to multiple "...strided_batched..." calls. + * + * The rocblas_gemm API handles inputs and output matrices as + * column-major format. When doing a C = A * B, we actually do + * C^T = (B^T) * (A^T). That is the reason we input args[1] as + * A and args[0] as B in calling the rocblas_gemm. + * + */ + auto create_strided_batched_args_common(context& ctx, + rb_compute_type rbcompute_type, + const std::vector& args) const + { + return pack(ctx.get_stream().get_rocblas(), + transb ? rocblas_operation_transpose : rocblas_operation_none, + transa ? rocblas_operation_transpose : rocblas_operation_none, + n, + m, + k, + get_alpha(), + args[1].data(), + arg_type, + ldb, + b_stride, + args[0].data(), + arg_type, + lda, + a_stride, + get_beta(), + args[2].data(), + output_type, + ldc, + c_stride, + is_3inputs ? args[3].data() : args[2].data(), + output_type, + ldd, + d_stride, + num_matrices, + rbcompute_type); + } + /** + * Helper method to create that subset of a long rocBLAS argument list that is common + * to multiple "gemm_ex..." calls. + * + * The rocblas_gemm API handles inputs and output matrices as + * column-major format. When doing a C = A * B, we actually do + * C^T = (B^T) * (A^T). That is the reason we input args[1] as + * A and args[0] as B in calling the rocblas_gemm. + * + * */ + auto create_gemm_ex_args_common(context& ctx, + rb_compute_type rbcompute_type, + const std::vector& args) const + { + return pack(ctx.get_stream().get_rocblas(), + transb ? rocblas_operation_transpose : rocblas_operation_none, + transa ? rocblas_operation_transpose : rocblas_operation_none, + n, + m, + k, + get_alpha(), + args[1].data(), + arg_type, + ldb, + args[0].data(), + arg_type, + lda, + get_beta(), + args[2].data(), + output_type, + ldc, + is_3inputs ? args[3].data() : args[2].data(), + output_type, + ldd, + rbcompute_type); + } + +#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API + /** + * Find best rocBLAS solution: Get list of solutions and try them all, returning the index + * of the fastest one. 
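+ *
+ * Each candidate index is run once as a warm-up and then timed over hot_calls
+ * back-to-back invocations; the candidate with the lowest average host time wins.
+ *
+ * Rough usage sketch (mirrors gemm_finalize_impl further below; the <float>
+ * instantiation here is only an example):
+ *
+ *   auto g = gemm_impl<float>(output_shape, input_shapes, alpha, beta, compute_fp32);
+ *   int32_t best = g.tune(ctx, input_shapes);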
+ */ + int tune(context& ctx, const std::vector& input_shapes) const + { + // tuning meta parameters + const int hot_calls = 40; + unsigned long seed = 0; + std::vector input_args; + std::transform(input_shapes.begin(), + input_shapes.end(), + std::back_inserter(input_args), + [&](const shape& x) { + return to_gpu(generate_argument(x, seed++, random_mode::random)); + }); + + // Get the solutions list in 2 rocBLAS steps: + // 1. Find out how many solutions there are and allocate the array + // 2. Get the solutions + // + rocblas_int list_size = 0; + std::vector solution_indices; + rb_compute_type rbcompute_type = compute_type; + // rocblas_gemm_get_solutions() API requires compute_type as rocblas_datatype. Convert + // manually for FP8 + if(arg_type == rocblas_datatype_f8_r) + { + rbcompute_type = rocblas_datatype_f32_r; + } + if(strided_batched) + { + auto common_args = create_strided_batched_args_common(ctx, rbcompute_type, input_args); + rocblas_invoke(&rocblas_gemm_strided_batched_ex_get_solutions, + common_args, + rocblas_gemm_algo_solution_index, + gemm_flags, + nullptr, + &list_size); + solution_indices.resize(list_size); + + auto common_sol_args = + create_strided_batched_args_common(ctx, rbcompute_type, input_args); + rocblas_invoke(&rocblas_gemm_strided_batched_ex_get_solutions, + common_sol_args, + rocblas_gemm_algo_solution_index, + gemm_flags, + solution_indices.data(), + &list_size); + } + else + { + auto common_args = create_gemm_ex_args_common(ctx, rbcompute_type, input_args); + rocblas_invoke(&rocblas_gemm_ex_get_solutions, + common_args, + rocblas_gemm_algo_solution_index, + gemm_flags, + nullptr, + &list_size); + solution_indices.resize(list_size); + + auto common_sol_args = create_gemm_ex_args_common(ctx, rbcompute_type, input_args); + rocblas_invoke(&rocblas_gemm_ex_get_solutions, + common_sol_args, + rocblas_gemm_algo_solution_index, + gemm_flags, + solution_indices.data(), + &list_size); + } + + double best_time = std::numeric_limits::max(); + double first_time = -1; + // Initialize to default solution index + rocblas_int best_sol = 0; + for(auto sol : solution_indices) + { + // Warmup: the first call to an op. may not be representative since there is + // more time taken initializing caches, etc. so we won't time it. + run(ctx, input_args, sol); + double host_time = time([&] { + for([[maybe_unused]] int hc : range(hot_calls)) + run(ctx, input_args, sol); + ctx.finish(); + }); + + host_time /= hot_calls; + + // dev/evaluation only: track time for first solution. 
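+ // (first_time is only used in the "Winning GEMM solution" log line after the loop)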
+ if(first_time < 0) + first_time = host_time; + + // track current best + if(host_time < best_time) + { + best_sol = sol; + best_time = host_time; + } + } + std::cout << "Winning GEMM solution: " << best_sol << " in " << best_time << " ms, beats " + << first_time << "ms" << std::endl; + std::this_thread::sleep_for(std::chrono::milliseconds{50}); + return best_sol; + } +#endif + private: + size_t num_matrices = 0; + rocblas_int m = 0; + rocblas_int n = 0; + rocblas_int k = 0; + bool transa = false; + bool transb = false; + T alpha = 0; + T beta = 0; + + std::function get_alpha{}; + std::function get_beta{}; + rocblas_gemm_flags gemm_flags = rocblas_gemm_flags_none; + rocblas_int lda = 0; + rocblas_int ldb = 0; + rocblas_int ldc = 0; + rocblas_int ldd = 0; + rocblas_int a_stride = 0; + rocblas_int b_stride = 0; + rocblas_int c_stride = 0; + rocblas_int d_stride = 0; + rocblas_datatype arg_type = rocblas_datatype_f32_r; + rb_compute_type compute_type = rocblas_datatype_f32_r; + rocblas_datatype output_type = rocblas_datatype_f32_r; + bool strided_batched = true; + bool is_3inputs = true; + bool compute_fp32 = true; +}; // gemm_impl + +void gemm_compute(context& ctx, + const shape& output_shape, + const std::vector& args, + float alpha, + float beta, + bool compute_fp32, + int32_t solution_idx) +{ + std::vector input_shapes; + std::transform(args.begin(), + args.end(), + std::back_inserter(input_shapes), + [](const argument& x) { return x.get_shape().normalize_standard(); }); + auto gemm_item = gemm_impl(output_shape, input_shapes, alpha, beta, compute_fp32); + gemm_item.run(ctx, args, solution_idx); +} + +void gemm_compute(context& ctx, + const shape& output_shape, + const std::vector& args, + int32_t alpha, + int32_t beta, + bool compute_fp32, + int32_t solution_idx) +{ + std::vector input_shapes; + std::transform(args.begin(), + args.end(), + std::back_inserter(input_shapes), + [](const argument& x) { return x.get_shape().normalize_standard(); }); + auto gemm_item = gemm_impl(output_shape, input_shapes, alpha, beta, compute_fp32); + gemm_item.run(ctx, args, solution_idx); +} + +static value gemm_problem(const shape& output_shape, std::vector input_shapes) +{ + input_shapes.push_back(output_shape); + return to_value(input_shapes); +} + +#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API +static void gemm_save_solution(context& ctx, + const shape& output_shape, + const std::vector& input_shapes, + int32_t solution_idx) +{ + ctx.get_problem_cache().insert( + "rocblas", gemm_problem(output_shape, input_shapes), solution_idx); +} +#endif + +int32_t gemm_default_solution(context& ctx, + const shape& output_shape, + const std::vector& input_shapes) +{ + auto sol = ctx.get_problem_cache().get("rocblas", gemm_problem(output_shape, input_shapes)); + if(sol.has_value()) + return sol->to(); + return 0; +} + +/** + * Decides if the tune() or validate() method is appropriate and calls it. + * Return value is the chosen solution index, or 0 to let picker choose it. + */ +template +int32_t gemm_finalize_impl(context& ctx, + const shape& output_shape, + const std::vector& input_shapes, + T alpha, + T beta, + bool compute_fp32, + int32_t solution_idx) +{ +#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API + + // This code should be called only if either the environment var. 
+ // MIGRAPHX_ENABLE_GEMM_TUNING, or option --exhaustive-tune, is set + + if(solution_idx == 0) + { + auto gemm_item = gemm_impl(output_shape, input_shapes, alpha, beta, compute_fp32); + solution_idx = gemm_item.tune(ctx, input_shapes); + gemm_save_solution(ctx, output_shape, input_shapes, solution_idx); + } + else + { + // If a tuned solution index is already given, don't tune again but validate + // in case the data was tuned with a different rocBLAS version + auto gemm_item = gemm_impl(output_shape, input_shapes, alpha, beta, compute_fp32); + solution_idx = gemm_item.validate(ctx, input_shapes, solution_idx); + } +#else + (void)ctx, (void)output_shape, (void)input_shapes; + (void)alpha, (void)beta, (void)compute_fp32; +#endif + return solution_idx; +} + +int32_t gemm_finalize(context& ctx, + const shape& output_shape, + const std::vector& input_shapes, + float alpha, + float beta, + bool compute_fp32, + int32_t solution_idx) +{ + return gemm_finalize_impl( + ctx, output_shape, input_shapes, alpha, beta, compute_fp32, solution_idx); +} + +int32_t gemm_finalize(context& ctx, + const shape& output_shape, + const std::vector& input_shapes, + int32_t alpha, + int32_t beta, + bool compute_fp32, + int32_t solution_idx) +{ + return gemm_finalize_impl( + ctx, output_shape, input_shapes, alpha, beta, compute_fp32, solution_idx); +} +#endif +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/hip.cpp b/docker/rocm/migraphx/targets/gpu/hip.cpp new file mode 100644 index 000000000..0fb7deb93 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/hip.cpp @@ -0,0 +1,330 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#if MIGRAPHX_USE_MIOPEN +#include +#endif +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_REGISTER_OP(hip_allocate) +MIGRAPHX_REGISTER_OP(hip_fill) +MIGRAPHX_REGISTER_OP(hip_sync_stream) +MIGRAPHX_REGISTER_OP(hip_copy_to_gpu) +MIGRAPHX_REGISTER_OP(hip_copy_from_gpu) +MIGRAPHX_REGISTER_OP(hip_copy) +MIGRAPHX_REGISTER_OP(hip_allocate_memory) +MIGRAPHX_REGISTER_OP(hip_copy_literal) + +using hip_ptr = MIGRAPHX_MANAGE_PTR(void, hipFree); +using hip_host_ptr = MIGRAPHX_MANAGE_PTR(void, hipHostUnregister); + +std::string hip_error(int error) { return hipGetErrorString(static_cast(error)); } + +bool is_device_ptr(const void* ptr) +{ + hipPointerAttribute_t attr; + auto status = hipPointerGetAttributes(&attr, ptr); + if(status != hipSuccess) + return false; + return attr.type == hipMemoryTypeDevice; +} + +std::size_t get_available_gpu_memory() +{ + size_t free; + size_t total; + auto status = hipMemGetInfo(&free, &total); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed getting available memory: " + hip_error(status)); + return free; +} + +void* get_device_ptr(void* hptr) +{ + void* result = nullptr; + auto status = hipHostGetDevicePointer(&result, hptr, 0); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed getting device pointer: " + hip_error(status)); + return result; +} + +struct host_ptr_cache +{ + std::unordered_map> cache; + std::mutex m; + std::shared_ptr get(void* ptr) + { + std::lock_guard lock(m); + auto it = cache.find(ptr); + if(it != cache.end()) + return it->second.lock(); + return nullptr; + } + + void put(const std::shared_ptr& p) + { + std::lock_guard lock(m); + cache[p.get()] = p; + } +}; + +static host_ptr_cache& get_host_ptr_cache() +{ + static host_ptr_cache cache; + return cache; +} + +std::shared_ptr allocate_gpu(std::size_t sz, bool host = false) +{ + if(sz > get_available_gpu_memory()) + MIGRAPHX_THROW("Memory not available to allocate buffer: " + std::to_string(sz)); + void* alloc_ptr = nullptr; + auto status = host ? 
hipHostMalloc(&alloc_ptr, sz) : hipMalloc(&alloc_ptr, sz); + if(status != hipSuccess) + { + if(host) + MIGRAPHX_THROW("Gpu allocation failed: " + hip_error(status)); + else + return allocate_gpu(sz, true); + } + assert(alloc_ptr != nullptr); + std::shared_ptr result = share(hip_ptr{alloc_ptr}); + if(host) + { + get_host_ptr_cache().put(result); + } + return result; +} + +std::shared_ptr register_on_gpu(void* ptr, std::size_t sz) +{ + std::shared_ptr result = get_host_ptr_cache().get(ptr); + if(result) + { + return result; + } + auto status = hipHostRegister(ptr, sz, hipHostRegisterMapped); + if(status != hipSuccess) + MIGRAPHX_THROW("Gpu register failed: " + hip_error(status)); + result = share(hip_host_ptr{ptr}); + get_host_ptr_cache().put(result); + return result; +} + +template +std::vector read_from_gpu(const void* x, std::size_t sz) +{ + gpu_sync(); + std::vector result(sz); + assert(not is_device_ptr(result.data())); + if(not is_device_ptr(x)) + { + MIGRAPHX_THROW( + "read_from_gpu() requires Src buffer to be on the GPU, Copy from gpu failed\n"); + } + auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost); + if(status != hipSuccess) + MIGRAPHX_THROW("Copy from gpu failed: " + hip_error(status)); // NOLINT + return result; +} + +std::shared_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false) +{ + gpu_sync(); + auto result = allocate_gpu(sz, host); + assert(is_device_ptr(result.get())); + assert(not is_device_ptr(x)); + auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice); + if(status != hipSuccess) + MIGRAPHX_THROW("Copy to gpu failed: " + hip_error(status)); + return result; +} + +template +hip_ptr write_to_gpu(const T& x) +{ + using type = typename T::value_type; + auto size = x.size() * sizeof(type); + return write_to_gpu(x.data(), size); +} + +argument allocate_gpu(const shape& s, bool host) +{ + auto p = allocate_gpu(s.bytes() + 1, host); + return {s, [p]() mutable { return reinterpret_cast(p.get()); }}; +} + +argument register_on_gpu(const argument& arg) +{ + auto arg_shared = arg.share(); + auto p = register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes()); + auto s = arg_shared.get_shape(); + return {s, [p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }}; +} + +argument to_gpu(const argument& arg, bool host) +{ + argument result; + arg.visit( + [&](auto x) { + auto p = write_to_gpu(arg.data(), arg.get_shape().bytes(), host); + result = {x.get_shape(), p}; + }, + [&](const auto& xs) { + std::vector args; + std::transform(xs.begin(), xs.end(), std::back_inserter(args), [&](auto x) { + return to_gpu(x, host); + }); + result = argument{args}; + }); + return result; +} + +argument from_gpu(const argument& arg) +{ + argument result; + arg.visit( + [&](auto x) { + using type = typename decltype(x)::value_type; + auto v = read_from_gpu(arg.data(), x.get_shape().bytes() / sizeof(type)); + // cppcheck-suppress returnDanglingLifetime + result = {x.get_shape(), [v]() mutable { return v.data(); }}; + }, + [&](const auto& xs) { + std::vector args; + std::transform(xs.begin(), xs.end(), std::back_inserter(args), [&](auto x) { + return from_gpu(x); + }); + result = argument{args}; + }); + + return result; +} + +void set_device(std::size_t id) +{ + auto status = hipSetDevice(id); + if(status != hipSuccess) + MIGRAPHX_THROW("Error setting device"); +} + +void gpu_sync() +{ + auto status = hipDeviceSynchronize(); + if(status != hipSuccess) + MIGRAPHX_THROW("hip device synchronization failed: " + 
hip_error(status)); +} + +void gpu_sync(const context& ctx) { ctx.finish(); } + +void hip_async_memset(context& ctx, const argument& dst, int value) +{ + std::size_t dst_size = dst.get_shape().bytes(); + auto status = hipMemsetAsync(dst.data(), value, dst_size, ctx.get_stream().get()); + if(status != hipSuccess) + MIGRAPHX_THROW("Gpu fill failed: " + hip_error(status)); +} + +void hip_async_copy(context& ctx, const argument& src, const argument& dst, hipMemcpyKind kind) +{ + std::size_t src_size = src.get_shape().bytes(); + std::size_t dst_size = dst.get_shape().bytes(); + if(src_size > dst_size) + MIGRAPHX_THROW("Not enough memory available in destination to do copy"); + auto status = hipMemcpyAsync(dst.data(), src.data(), src_size, kind, ctx.get_stream().get()); + if(status != hipSuccess) + MIGRAPHX_THROW("Gpu copy failed: " + hip_error(status)); +} + +void gpu_copy(context& ctx, const argument& src, const argument& dst) +{ + // Workaround: Use contiguous as hip's memcpy is broken + device::contiguous(ctx.get_stream().get(), dst, src); + // hip_async_copy(ctx, src, dst, hipMemcpyDeviceToDevice); +} + +void copy_to_gpu(context& ctx, const argument& src, const argument& dst) +{ + if(src.get_shape() == dst.get_shape() and dst.get_shape().packed()) + { + hip_async_copy(ctx, src, dst, hipMemcpyHostToDevice); + } + else + { + gpu_copy(ctx, register_on_gpu(src), dst); + } +} + +void copy_from_gpu(context& ctx, const argument& src, const argument& dst) +{ + if(src.get_shape() == dst.get_shape() and dst.get_shape().packed()) + { + hip_async_copy(ctx, src, dst, hipMemcpyDeviceToHost); + } + else + { + gpu_copy(ctx, src, register_on_gpu(dst)); + } +} + +argument get_preallocation(context& ctx, const std::string& id) +{ + return ctx.get_current_device().preallocations.at(id); +} + +void gpu_fill(context& ctx, const argument& dst, int value) +{ + if(dst.get_sub_objects().empty()) + { + // TODO: Handle non-packed tensor when value is not 0 + assert(dst.get_shape().packed() and value == 0); + hip_async_memset(ctx, dst, value); + } + else + { + for(const auto& arg : dst.get_sub_objects()) + gpu_fill(ctx, arg, value); + } +} + +void store_preallocated_param(context& ctx, const std::string& id, const argument& a) +{ + ctx.get_current_device().preallocations[id] = a; +} + +// clang-format off +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/hip_gemm_impl.cpp b/docker/rocm/migraphx/targets/gpu/hip_gemm_impl.cpp new file mode 100644 index 000000000..966927da7 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/hip_gemm_impl.cpp @@ -0,0 +1,754 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#if MIGRAPHX_USE_HIPBLASLT +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +using microseconds = std::chrono::duration; + +hipDataType compute_to_hip_type(hipblasComputeType_t type) +{ + switch(type) + { + case HIPBLAS_COMPUTE_32F: return HIP_R_32F; + case HIPBLAS_COMPUTE_32I: return HIP_R_32I; + case HIPBLAS_COMPUTE_16F: + case HIPBLAS_COMPUTE_64F: + case HIPBLAS_COMPUTE_32I_PEDANTIC: + case HIPBLAS_COMPUTE_16F_PEDANTIC: + case HIPBLAS_COMPUTE_32F_PEDANTIC: + case HIPBLAS_COMPUTE_64F_PEDANTIC: + case HIPBLAS_COMPUTE_32F_FAST_16F: + case HIPBLAS_COMPUTE_32F_FAST_16BF: + case HIPBLAS_COMPUTE_32F_FAST_TF32: + MIGRAPHX_THROW("HIPBLAS_GEMM: conversion from hipComputeType_t to hipDataType failed"); + } +} + +// Convert hipBLAS datatypes to equivalent MIGraphX data types +hipDataType get_type_hipblas(shape::type_t type) +{ + switch(type) + { + case shape::double_type: return HIP_R_64F; + case shape::float_type: return HIP_R_32F; + case shape::half_type: return HIP_R_16F; + case shape::int8_type: return HIP_R_8I; + case shape::uint8_type: return HIP_R_8U; + case shape::int32_type: return HIP_R_32I; + case shape::uint32_type: return HIP_R_32U; + case shape::fp8e4m3fnuz_type: return HIP_R_8F_E4M3_FNUZ; + case shape::fp8e5m2fnuz_type: + return HIP_R_8F_E5M2_FNUZ; +// TODO can remove this preprocessor conditional when hip verison defaults to have these types +#ifdef ROCM_USE_FLOAT8 + case shape::fp8e4m3fn_type: return HIP_R_8F_E4M3; + case shape::fp8e5m2_type: return HIP_R_8F_E5M2; +#else + case shape::fp8e4m3fn_type: + case shape::fp8e5m2_type: +#endif + case shape::tuple_type: + case shape::bool_type: + case shape::uint16_type: + case shape::int16_type: + case shape::int64_type: + case shape::uint64_type: MIGRAPHX_THROW("HIPBLAS_GEMM: data type not supported!"); + case shape::bf16_type: return HIP_R_16BF; + } + + MIGRAPHX_THROW("HIPBLAS_GEMM: data type not supported!"); +} + +void blas_shape_hip(const shape& in_shape) +{ + if(in_shape.lens().size() < 2) + return; + auto s = in_shape.normalize_standard(); + if(std::none_of(s.strides().end() - 2, s.strides().end(), [](auto i) { return i == 1; })) + MIGRAPHX_THROW("GPU_GEMM: needs to have one matrix stride as 1"); + if(std::any_of(s.strides().end() - 2, s.strides().end(), [](auto i) { return i == 0; })) + MIGRAPHX_THROW("GPU_GEMM: matrix dimensions can't be broadcasted"); + if(s.lens().size() < 3) + return; + shape batch_shape{s.type(), + {s.lens().begin(), s.lens().end() - 2}, + {s.strides().begin(), s.strides().end() - 2}}; + auto batch_shapes = reduce_dims({batch_shape}); + if(batch_shapes.front().lens().size() != 1) + MIGRAPHX_THROW("GPU_GEMM: Batch dimension is not collapsible"); +} + +shape transpose_batch_hip(const shape& s, unsigned trans_batch) +{ + if(trans_batch == 0) + return s; + if(s.lens().size() < 3) + return s; + auto batch = s.lens().size() - 3; + std::vector perm(s.lens().size()); + std::iota(perm.begin(), perm.end(), 
0); + std::swap(perm[batch], perm[batch + trans_batch]); + return shape::from_permutation(s.type(), s.lens(), perm); +} + +static bool is_transposed_hip(const shape& s) { return s.transposed() and s.strides().back() != 1; } + +static int32_t get_batch_stride_hip(const shape& s) +{ + // This value is not needed for non-strided inputs + if(s.strides().size() < 3) + return 0; + else + return s.strides()[s.strides().size() - 3]; +} + +/** + * Wrapper for multiple hipBLASLt calls. The constructor creates parameters for + * these calls based on data shapes and other values contained in the associated + * instruction and operation. + */ +struct hip_gemm_impl +{ + hip_gemm_impl(const shape& output_shape, + const std::vector& input_shapes, + float alpha_param, + float beta_param) + : alpha(alpha_param), beta(beta_param), is_3inputs(input_shapes.size() == 5) + { + if(not is_3inputs) + { + beta = 0; + } + + // Create lambdas that will cast alpha, beta to the output shape's type + // and retain the values being pointed to + output_shape.visit_type([&](auto as) { + if(as.is_integral()) + { + int32_t alpha_r = int32_t(alpha); + int32_t beta_r = int32_t(beta); + get_alpha = [=] { return &alpha_r; }; + get_beta = [=] { return &beta_r; }; + } + else + { + get_alpha = [=] { return α }; + get_beta = [=] { return β }; + } + }); + + transa = is_transposed_hip(input_shapes[0]); + transb = is_transposed_hip(input_shapes[1]); + op_a = transa ? HIPBLAS_OP_T : HIPBLAS_OP_N; + op_b = transb ? HIPBLAS_OP_T : HIPBLAS_OP_N; + + auto n_dim = output_shape.lens().size(); + auto dim_0 = n_dim - 2; + auto dim_1 = n_dim - 1; + // Leading dimensions of matrices + lda = input_shapes[0].strides()[transa ? dim_1 : dim_0]; + ldb = input_shapes[1].strides()[transb ? dim_1 : dim_0]; + ldc = is_3inputs ? input_shapes[2].strides()[dim_0] : input_shapes[3].strides()[dim_0]; + ldd = is_3inputs ? input_shapes[4].strides()[dim_0] : ldc; + + auto out_lens = output_shape.lens(); + m = out_lens[dim_0]; + n = out_lens[dim_1]; + k = input_shapes[0].lens()[dim_1]; + + a_stride = get_batch_stride_hip(input_shapes[0]); + b_stride = get_batch_stride_hip(input_shapes[1]); + c_stride = is_3inputs ? get_batch_stride_hip(input_shapes[2]) + : get_batch_stride_hip(input_shapes[3]); + d_stride = is_3inputs ? get_batch_stride_hip(input_shapes[4]) : c_stride; + num_matrices = std::accumulate( + out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies()); + + arg_type = get_type_hipblas(input_shapes[0].type()); + output_type = is_3inputs ? 
get_type_hipblas(input_shapes[4].type()) + : get_type_hipblas(input_shapes[3].type()); + + if(arg_type == HIP_R_8I or arg_type == HIP_R_8U) + { + compute_type = HIPBLAS_COMPUTE_32I; + } + else + { + compute_type = HIPBLAS_COMPUTE_32F; + } + if(op_a == HIPBLAS_OP_T) + { + hipblaslt_invoke( + [&]() { return hipblasLtMatrixLayoutCreate(&mat_a, arg_type, m, k, lda); }); + } + else + { + hipblaslt_invoke( + [&]() { return hipblasLtMatrixLayoutCreate(&mat_a, arg_type, k, m, lda); }); + } + if(op_b == HIPBLAS_OP_T) + { + hipblaslt_invoke( + [&]() { return hipblasLtMatrixLayoutCreate(&mat_b, arg_type, k, n, ldb); }); + } + else + { + hipblaslt_invoke( + [&]() { return hipblasLtMatrixLayoutCreate(&mat_b, arg_type, n, k, ldb); }); + } + hipblaslt_invoke( + [&]() { return hipblasLtMatrixLayoutCreate(&mat_c, output_type, n, m, ldc); }); + + if(is_3inputs) + { + hipblaslt_invoke( + [&]() { return hipblasLtMatrixLayoutCreate(&mat_d, output_type, n, m, ldd); }); + } + if(num_matrices > 1) + { + hipblaslt_invoke([&]() { + return hipblasLtMatrixLayoutSetAttribute(mat_a, + HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, + &num_matrices, + sizeof(num_matrices)); + }); + hipblaslt_invoke([&]() { + return hipblasLtMatrixLayoutSetAttribute(mat_b, + HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, + &num_matrices, + sizeof(num_matrices)); + }); + hipblaslt_invoke([&]() { + return hipblasLtMatrixLayoutSetAttribute(mat_c, + HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, + &num_matrices, + sizeof(num_matrices)); + }); + + hipblaslt_invoke([&]() { + return hipblasLtMatrixLayoutSetAttribute( + mat_a, + HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &a_stride, + sizeof(a_stride)); + }); + hipblaslt_invoke([&]() { + return hipblasLtMatrixLayoutSetAttribute( + mat_b, + HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &b_stride, + sizeof(b_stride)); + }); + hipblaslt_invoke([&]() { + return hipblasLtMatrixLayoutSetAttribute( + mat_c, + HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &c_stride, + sizeof(c_stride)); + }); + + if(is_3inputs) + { + hipblaslt_invoke([&]() { + return hipblasLtMatrixLayoutSetAttribute(mat_d, + HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, + &num_matrices, + sizeof(num_matrices)); + }); + hipblaslt_invoke([&]() { + return hipblasLtMatrixLayoutSetAttribute( + mat_d, + HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &d_stride, + sizeof(d_stride)); + }); + } + } + hipblaslt_invoke([&]() { + return hipblasLtMatmulDescCreate( + &hipblaslt_desc, compute_type, compute_to_hip_type(compute_type)); + }); + hipblaslt_invoke([&]() { + return hipblasLtMatmulDescSetAttribute( + hipblaslt_desc, HIPBLASLT_MATMUL_DESC_TRANSB, &op_a, sizeof(int32_t)); + }); + hipblaslt_invoke([&]() { + return hipblasLtMatmulDescSetAttribute( + hipblaslt_desc, HIPBLASLT_MATMUL_DESC_TRANSA, &op_b, sizeof(int32_t)); + }); + + // Transfer ownership of raw pointers to managed pointers. 
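+        // The managed_* members are MIGRAPHX_MANAGE_PTR RAII wrappers (see the member
+        // declarations below), so the hipblasLt matmul descriptor and matrix layouts created
+        // above are destroyed automatically when this hip_gemm_impl object goes out of scope.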
+ managed_hipblaslt_desc.reset(hipblaslt_desc); + managed_mat_a.reset(mat_a); + managed_mat_b.reset(mat_b); + managed_mat_c.reset(mat_c); + if(is_3inputs) + { + managed_mat_d.reset(mat_d); + } + } + + ~hip_gemm_impl() {} + + struct solution + { + solution() : handle(nullptr), preference(nullptr) {} + + auto get_hipblaslt_preference() + { + if(hbltpreference == nullptr) + { + hbltpreference = create_hipblaslt_preference_ptr(); + } + assert(hbltpreference.get() != nullptr); + return hbltpreference.get(); + } + + void init(context& ctx) + { + if(handle == nullptr) + { + handle = ctx.get_stream().get_hipblaslt(); + preference = get_hipblaslt_preference(); + } + } + + auto& get_result(context& ctx, hip_gemm_impl& gemm, int32_t idx) + { + init(ctx); + if(idx == 0) + { + // use default solution + const int n_sol = 1; + int returned_algo_count; + heuristic_result.resize(n_sol); + uint64_t max_workspace = std::numeric_limits::max(); + hipblaslt_invoke([&]() { + return hipblasLtMatmulPreferenceSetAttribute( + preference, + HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &max_workspace, + sizeof(uint64_t)); + }); + hipblaslt_invoke([&]() { + return hipblasLtMatmulAlgoGetHeuristic(handle, + gemm.hipblaslt_desc, + gemm.mat_b, + gemm.mat_a, + gemm.mat_c, + gemm.is_3inputs ? gemm.mat_d + : gemm.mat_c, + preference, + n_sol, + heuristic_result.data(), + &returned_algo_count); + }); + + if(returned_algo_count != n_sol) + { + std::cout << "less solution found! request: " << n_sol + << ", found: " << returned_algo_count << std::endl; + } + } + else + { + // query for the solutions. 1st as the best. + std::vector algo_index = {idx}; + hipblaslt_invoke([&]() { + return hipblaslt_ext::getAlgosFromIndex(handle, algo_index, heuristic_result); + }); + assert(heuristic_result.size() == 1); + } + return heuristic_result; + } + + private: + hipblasLtHandle_t handle; + hipblasLtMatmulPreference_t preference; + std::vector heuristic_result; + shared hbltpreference = nullptr; + } solution; + + /** + * Helper method to create that subset of a long hipblaslt argument list that is common + * to multiple "hipblasLtMatmul" calls. + * + * The hipblaslt GEMM API handles inputs and output matrices as + * column-major format. When doing a C = A * B, we actually do + * C^T = (B^T) * (A^T). That is the reason we input args[1] as + * A and args[0] as B in calling the hipblaslt GEMM. + * + * */ + auto create_hipblaslt_args_common(context& ctx, + const std::vector& args, + int32_t solution_idx) + { + auto* algo = &solution.get_result(ctx, *this, solution_idx)[0].algo; + size_t workspace_size = ((is_3inputs ? args[3] : args[2]).get_shape()).bytes(); + return pack(ctx.get_stream().get_hipblaslt(), + hipblaslt_desc, + get_alpha(), // alpha + args[1].data(), // A + mat_b, // Adesc + args[0].data(), // B + mat_a, // Bdesc + get_beta(), // beta + is_3inputs ? args[2].data() : args[3].data(), // C + mat_c, // Cdesc + is_3inputs ? args[4].data() : args[3].data(), // D + is_3inputs ? mat_d : mat_c, // Ddesc + algo, // algo + is_3inputs ? args[3].data() : args[2].data(), // workspace + workspace_size, // workspaceSizeInBytes + ctx.get_stream().get() // stream + ); + } + + auto create_hipblaslt_supporting_args_common(context& ctx, + const std::vector& args, + hipblasLtMatmulAlgo_t& algo, + size_t& workspace_size) const + { + (void)(args); + return pack(ctx.get_stream().get_hipblaslt(), + hipblaslt_desc, + get_alpha(), + mat_b, + mat_a, + get_beta(), + mat_c, + is_3inputs ? 
mat_d : mat_c, + algo, + workspace_size); + } + + void + run(context& ctx, const std::vector& input_args, int32_t solution_idx = 0) // const + { + auto common_args = create_hipblaslt_args_common(ctx, input_args, solution_idx); + hipblaslt_invoke(&hipblasLtMatmul, common_args); + } + + auto + validate(context& ctx, const std::vector& input_shapes, int32_t solution_idx) // const + { + // Create dummy arguments for the shapes, and call the overloaded method + std::vector input_args; + std::transform(input_shapes.begin(), + input_shapes.end(), + std::back_inserter(input_args), + [](const shape& x) { return to_gpu(generate_argument(x)); }); + + return validate(ctx, input_args, solution_idx); + } + + /** + * Checks a particular solution for validity by running it (could be invalid if this model was + * tuned with a different hipBLASLt version) + * + * @return Returns either solution_idx if valid, or else the default value 0 + * if not. The default does not mean list index 0, but tells the picker + * to choose a solution. + */ + int32_t + validate(context& ctx, const std::vector& input_args, int32_t solution_idx) // const + { + auto common_args = create_hipblaslt_args_common(ctx, input_args, solution_idx); + auto check_valid = hipblaslt_invoke(&hipblasLtMatmul, common_args, false); + if(check_valid != HIPBLAS_STATUS_SUCCESS) + { + std::cerr << "WARNING: tuned solution is invalid; reverting to default" << std::endl; + return 0; + } + return solution_idx; + } + + /** + * Get workspace size for the solution index: Gets algo from the solution index, + * and calls matmulIsAlgoSupported() to get the workspace size. + */ + + size_t get_workspace_size(context& ctx, + const std::vector& input_shapes, + int32_t solution_idx) const + { + size_t workspace_size = hipblaslt_workspace_size; + std::vector input_args; + std::transform(input_shapes.begin(), + input_shapes.end(), + std::back_inserter(input_args), + [](const shape& x) { return to_gpu(generate_argument(x)); }); + + std::vector algo_index = {solution_idx}; + std::vector heuristic_result; + + hipblaslt_invoke([&]() { + return hipblaslt_ext::getAlgosFromIndex( + ctx.get_stream().get_hipblaslt(), algo_index, heuristic_result); + }); + assert(heuristic_result.size() == 1); + + auto algo = heuristic_result[0].algo; + size_t ret_workspace_size = 0; + auto supporting_args = + create_hipblaslt_supporting_args_common(ctx, input_args, algo, ret_workspace_size); + + auto status = + hipblaslt_invoke(&hipblaslt_ext::matmulIsAlgoSupported, supporting_args, false); + + // If algo is supported, update the workspace size to the actual size needed. + // Otherwise, use the default workspace size. + if(status == HIPBLAS_STATUS_SUCCESS) + { + // TODO: Remove this check once issues with '0' workspace size are resolved. + // Temporarily, we use the approach where, if the returned workspace size is '0', + // we use the default workspace size. + // Otherwise, we use the returned workspace size. + if(ret_workspace_size != 0) + workspace_size = ret_workspace_size; + } + return workspace_size; + } + + /** + * Find best hipBLASLt solution: Get list of solutions and try them all, returning the index + * of the fastest one. 
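+     * Each candidate solution is warmed up with one untimed call and then timed over a fixed
+     * number of hot calls; the candidate with the lowest average host time per call is returned.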
+ */ + int tune(context& ctx, const std::vector& input_shapes) // const + { + // tuning meta parameters + const int hot_calls = 40; + + std::vector input_args; + std::transform(input_shapes.begin(), + input_shapes.end(), + std::back_inserter(input_args), + [](const shape& x) { return to_gpu(generate_argument(x)); }); + + std::vector result; + hipblaslt_invoke([&]() { + return hipblaslt_ext::getAllAlgos(ctx.get_stream().get_hipblaslt(), + hipblaslt_ext::GemmType::HIPBLASLT_GEMM, + op_a, + op_b, + arg_type, + arg_type, + output_type, + output_type, + compute_type, + result); + }); + std::vector solution_indices; + int returned_algo_count = result.size(); + for(int i = 0; i < returned_algo_count; i++) + { + auto algo = result[i].algo; + size_t ret_workspace_size = 0; + auto supporting_args = + create_hipblaslt_supporting_args_common(ctx, input_args, algo, ret_workspace_size); + try + { + hipblaslt_invoke(&hipblaslt_ext::matmulIsAlgoSupported, supporting_args); + solution_indices.push_back(hipblaslt_ext::getIndexFromAlgo(algo)); + } + catch(...) + { + // algo is not supported, continue in that case + continue; + } + } + + double best_time = std::numeric_limits::max(); + double first_time = -1; + + // Initialize to default solution index + int32_t best_sol = 0; + // If no valid/supported solution is returned, use hipblasLtMatmulAlgoGetHeuristic + // to get an algo and use solution index from that algo. + if(solution_indices.empty()) + { + auto algo = solution.get_result(ctx, *this, 0)[0].algo; + solution_indices.push_back(hipblaslt_ext::getIndexFromAlgo(algo)); + } + for(auto sol : solution_indices) + { + // Warmup: the first call to an op. may not be representative since there is + // more time taken initializing caches, etc. so we won't time it. + run(ctx, input_args, sol); + double host_time = time([&] { + for([[maybe_unused]] int hc : range(hot_calls)) + run(ctx, input_args, sol); + ctx.finish(); + }); + + host_time /= hot_calls; + + // dev/evaluation only: track time for first solution. 
+ if(first_time < 0) + first_time = host_time; + + // track current best + if(host_time < best_time) + { + best_sol = sol; + best_time = host_time; + } + } + + std::cout << "Winning GEMM solution: " << best_sol << " in " << best_time << " ms, beats " + << first_time << "ms" << std::endl; + return best_sol; + } + + // hipblaslt + size_t num_matrices = 0; + uint64_t m = 0; + uint64_t n = 0; + uint64_t k = 0; + bool transa = false; + bool transb = false; + float alpha = 0; + float beta = 0; + std::function get_alpha{}; + std::function get_beta{}; + + int64_t lda = 0; + int64_t ldb = 0; + int64_t ldc = 0; + int64_t ldd = 0; + int64_t a_stride = 0; + int64_t b_stride = 0; + int64_t c_stride = 0; + int64_t d_stride = 0; + bool is_3inputs = true; + + hipDataType arg_type = HIP_R_32F; + hipblasComputeType_t compute_type = HIPBLAS_COMPUTE_32F; + hipDataType output_type = HIP_R_32F; + hipblasLtMatmulDesc_t hipblaslt_desc; + hipblasOperation_t op_a; + hipblasOperation_t op_b; + using hipblaslt_matrix_layout = MIGRAPHX_MANAGE_PTR(hipblasLtMatrixLayout_t, + hipblasLtMatrixLayoutDestroy); + using hipblaslt_mat_mul_desc = MIGRAPHX_MANAGE_PTR(hipblasLtMatmulDesc_t, + hipblasLtMatmulDescDestroy); + hipblaslt_matrix_layout managed_mat_a, managed_mat_b, managed_mat_c, managed_mat_d; + hipblaslt_mat_mul_desc managed_hipblaslt_desc; + hipblasLtMatrixLayout_t mat_a, mat_b, mat_c, mat_d; + hipblasLtHandle_t handle; + hipblasLtMatmulPreference_t preference; +}; // hip_gemm_impl + +void hip_gemm_compute(context& ctx, + const shape& output_shape, + const std::vector& args, + float alpha, + float beta, + int32_t solution_idx) +{ + std::vector input_shapes; + std::transform(args.begin(), + args.end(), + std::back_inserter(input_shapes), + [](const argument& x) { return x.get_shape().normalize_standard(); }); + auto gemm_item = hip_gemm_impl(output_shape, input_shapes, alpha, beta); + gemm_item.run(ctx, args, solution_idx); +} + +static value hip_gemm_problem(const shape& output_shape, std::vector input_shapes) +{ + input_shapes.push_back(output_shape); + return to_value(input_shapes); +} + +static void hip_gemm_save_solution(context& ctx, + const shape& output_shape, + const std::vector& input_shapes, + int32_t solution_idx) +{ + ctx.get_problem_cache().insert( + "hipblaslt", hip_gemm_problem(output_shape, input_shapes), solution_idx); +} + +int32_t hip_gemm_finalize(context& ctx, + const shape& output_shape, + const std::vector& input_shapes, + float alpha, + float beta, + int32_t solution_idx) +{ + auto gemm_item = hip_gemm_impl(output_shape, input_shapes, alpha, beta); + if(solution_idx == 0) + { + solution_idx = gemm_item.tune(ctx, input_shapes); + hip_gemm_save_solution(ctx, output_shape, input_shapes, solution_idx); + } + // If a tuned solution index is already given, don't tune again but validate + // in case the data was tuned with a different hipBLASLt version. 
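+    // validate() re-runs the stored solution once and falls back to the default index (0)
+    // if hipBLASLt no longer accepts it.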
+ else + { + solution_idx = gemm_item.validate(ctx, input_shapes, solution_idx); + } + return solution_idx; +} + +int32_t hip_gemm_default_solution(context& ctx, + const shape& output_shape, + const std::vector& input_shapes) +{ + auto sol = + ctx.get_problem_cache().get("hipblaslt", hip_gemm_problem(output_shape, input_shapes)); + if(sol.has_value()) + return sol->to(); + return 0; +} + +size_t hip_gemm_workspace_size(context& ctx, + const shape& output_shape, + const std::vector& input_shapes, + float alpha, + float beta, + int32_t solution_idx) +{ + auto gemm_item = hip_gemm_impl(output_shape, input_shapes, alpha, beta); + return gemm_item.get_workspace_size(ctx, input_shapes, solution_idx); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_USE_HIPBLASLT diff --git a/docker/rocm/migraphx/targets/gpu/hipblaslt.cpp b/docker/rocm/migraphx/targets/gpu/hipblaslt.cpp new file mode 100644 index 000000000..47a9e9273 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/hipblaslt.cpp @@ -0,0 +1,69 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +#if MIGRAPHX_USE_HIPBLASLT +// for hipblaslt only +static const size_t workspace_size = hipblaslt_workspace_size; + +hipblaslt_handle_ptr create_hipblaslt_handle_ptr() +{ + hipblasLtHandle_t handle; + hipblasLtCreate(&handle); + return hipblaslt_handle_ptr{handle}; +} + +hipblaslt_preference_ptr create_hipblaslt_preference_ptr() +{ + hipblasLtMatmulPreference_t preference; + hipblasLtMatmulPreferenceCreate(&preference); + hipblaslt_invoke([&]() { + return hipblasLtMatmulPreferenceSetAttribute(preference, + HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &workspace_size, + sizeof(workspace_size)); + }); + return hipblaslt_preference_ptr{preference}; +} + +bool hipblaslt_supported() +{ + const auto device_name = trim(split_string(get_device_name(), ':').front()); + // hipblaslt is supported for MI200 and above, and Navi3x and above. 
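+    // gfx90a: MI200 series; gfx94x: MI300 series; gfx110x: Navi 3x (RDNA3); gfx120x: Navi 4x (RDNA4).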
+ return (device_name == "gfx90a" or + (starts_with(device_name, "gfx94") and device_name >= "gfx940") or + starts_with(device_name, "gfx110") or starts_with(device_name, "gfx120")); +} + +#endif // MIGRAPHX_USE_HIPBLASLT + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/hiprtc/CMakeLists.txt b/docker/rocm/migraphx/targets/gpu/hiprtc/CMakeLists.txt new file mode 100644 index 000000000..a8cb3cec0 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/hiprtc/CMakeLists.txt @@ -0,0 +1,40 @@ +##################################################################################### +# The MIT License (MIT) +# +# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +##################################################################################### + +add_executable(migraphx-hiprtc-driver + main.cpp +) +rocm_clang_tidy_check(migraphx-hiprtc-driver) +# On Windows, the driver's default 1MB stack size is not enough - increasing to 4MB. +set(STACK_SIZE 4194304) +if(MSVC) + target_link_options(migraphx-hiprtc-driver PRIVATE /STACK:${STACK_SIZE}) +elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC") + target_link_options(migraphx-hiprtc-driver PRIVATE -Xlinker /stack:${STACK_SIZE}) +endif() +target_link_libraries(migraphx-hiprtc-driver PRIVATE migraphx_gpu) +add_dependencies(migraphx_all_targets migraphx-hiprtc-driver) +rocm_install_targets( + TARGETS migraphx-hiprtc-driver +) diff --git a/docker/rocm/migraphx/targets/gpu/hiprtc/main.cpp b/docker/rocm/migraphx/targets/gpu/hiprtc/main.cpp new file mode 100644 index 000000000..d443ce49e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/hiprtc/main.cpp @@ -0,0 +1,92 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#endif + +std::vector read_stdin() +{ +#ifdef _WIN32 + // Set stream translation mode to BINARY to suppress translations. + // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/setmode?view=msvc-170 + auto old_mode = _setmode(_fileno(stdin), _O_BINARY); + if(old_mode == -1) + MIGRAPHX_THROW(std::strerror(errno)); +#endif + std::vector result; + std::array buffer{}; + std::size_t len = 0; + while((len = std::fread(buffer.data(), 1, buffer.size(), stdin)) > 0) + { + if(std::ferror(stdin) != 0 and std::feof(stdin) == 0) + MIGRAPHX_THROW(std::strerror(errno)); + + result.insert(result.end(), buffer.data(), buffer.data() + len); + } +#ifdef _WIN32 + // Reset to the previously set translation mode. + _setmode(_fileno(stdin), old_mode); +#endif + return result; +} + +int main(int argc, char const* argv[]) +{ + if(argc < 2 or migraphx::contains({"-h", "--help", "-v", "--version"}, std::string(argv[1]))) + { + std::cout << "USAGE:" << std::endl; + std::cout << " "; + std::cout << "Used internally by migraphx to compile hip programs out-of-process." + << std::endl; + std::exit(0); + } + std::string output_name = argv[1]; + try + { + auto v = migraphx::from_msgpack(read_stdin()); + std::vector srcs; + migraphx::from_value(v.at("srcs"), srcs); + auto out = + migraphx::gpu::compile_hip_src_with_hiprtc(std::move(srcs), + v.at("params").to_vector(), + v.at("arch").to()); + if(not out.empty()) + migraphx::write_buffer(output_name, out.front()); + } + catch(const std::exception& err) + { + std::cout << err.what() << std::endl; + } +} diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/abs.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/abs.hpp new file mode 100644 index 000000000..1a9f4b878 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/abs.hpp @@ -0,0 +1,65 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_ABS_HPP +#define MIGRAPHX_GUARD_RTGLIB_ABS_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; +#if MIGRAPHX_USE_MIOPEN + +struct miopen_abs +{ + op::abs op; + shared ad; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::abs"; } + shape compute_shape(const std::vector& inputs) const; + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const; + void finalize(context&, const shape&, const std::vector&); + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; +#endif +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/allocation_model.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/allocation_model.hpp new file mode 100644 index 000000000..249901f23 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/allocation_model.hpp @@ -0,0 +1,49 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_GPU_ALLOCATION_MODEL_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_GPU_ALLOCATION_MODEL_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct MIGRAPHX_GPU_EXPORT gpu_allocation_model +{ + std::string name() const; + std::string copy() const; + operation allocate(const shape& s) const; + operation preallocate(const shape& s, const std::string& id) const; + bool needs_out_params() const { return true; } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/analyze_streams.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/analyze_streams.hpp new file mode 100644 index 000000000..cf3d2ee42 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/analyze_streams.hpp @@ -0,0 +1,43 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_ANALYZE_STREAMS_HPP +#define MIGRAPHX_GUARD_RTGLIB_GPU_ANALYZE_STREAMS_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module; + +namespace gpu { + +MIGRAPHX_GPU_EXPORT std::vector analyze_streams(const module& m); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/argmax.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/argmax.hpp new file mode 100644 index 000000000..e05678fa8 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/argmax.hpp @@ -0,0 +1,61 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_ARGMAX_HPP +#define MIGRAPHX_GUARD_RTGLIB_ARGMAX_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +struct hip_argmax +{ + op::argmax op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::argmax"; } + shape compute_shape(const std::vector& inputs) const; + argument compute(context& ctx, const shape&, const std::vector& args) const; + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/argmin.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/argmin.hpp new file mode 100644 index 000000000..071eb525e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/argmin.hpp @@ -0,0 +1,61 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_ARGMIN_HPP +#define MIGRAPHX_GUARD_RTGLIB_ARGMIN_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +struct hip_argmin +{ + op::argmin op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::argmin"; } + shape compute_shape(const std::vector& inputs) const; + argument compute(context& ctx, const shape&, const std::vector& args) const; + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/ck.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/ck.hpp new file mode 100644 index 000000000..18d4dce25 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/ck.hpp @@ -0,0 +1,165 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_GPU_CK_HPP +#define MIGRAPHX_GUARD_GPU_CK_HPP + +#include +#include +#include +#include +#include + +#include "ck/host/device_gemm_multiple_d.hpp" +#include "ck/host/device_batched_gemm_softmax_gemm.hpp" + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +#ifndef _WIN32 +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK); +#endif + +// NOLINTNEXTLINE +const char* const disable_warning_pragma = R"__migraphx__( +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Weverything" +${content} +#pragma clang diagnostic pop +)__migraphx__"; + +template +std::string ck_disable_warnings(P p) +{ + return interpolate_string(disable_warning_pragma, + {{"content", std::string{p.data(), p.size()}}}); +} + +static std::unordered_map create_ck_header_strings() +{ + std::unordered_map result; + auto ck_headers = ck::host::GetHeaders(); + + std::transform( + ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto& p) { + return std::pair(p.first, ck_disable_warnings(p.second)); + }); + return result; +} + +static std::vector create_ck_headers() +{ + static const auto& header_strings = create_ck_header_strings(); + std::vector srcs; + std::transform(header_strings.begin(), + header_strings.end(), + std::back_inserter(srcs), + [&](auto& p) { return src_file{p}; }); + return srcs; +} + +static inline const std::vector& ck_headers() +{ + static const auto& headers = create_ck_headers(); + return headers; +} + +inline bool transposed_matrix(const shape& s) { return s.strides().back() != 1; } + +inline ck::host::DataType get_type(const shape& s) +{ + if(s.type() == shape::half_type) + return ck::host::DataType::Half; + else if(s.type() == shape::float_type) + return ck::host::DataType::Float; + else if(s.type() == shape::int8_type) + return ck::host::DataType::Int8; + else if(s.type() == shape::int32_type) + return ck::host::DataType::Int32; + MIGRAPHX_THROW("Unsupported ck type"); +} + +inline std::size_t get_batch_count(const shape& s) +{ + return std::accumulate( + s.lens().rbegin() + 2, s.lens().rend(), std::size_t{1}, std::multiplies()); +} + +inline void fold_batch_dims(shape& s) +{ + auto lens = s.lens(); + if(lens.size() <= 2) + return; + auto batch_count = get_batch_count(s); + auto m1 = lens.at(lens.size() - 2); + auto m2 = lens.at(lens.size() - 1); + if(transposed_matrix(s)) + s = shape{s.type(), {m1, m2 * batch_count}}; + else + s = shape{s.type(), {m1 * batch_count, m2}}; +} + +inline void remove_batch_dims(shape& s) +{ + auto lens = s.lens(); + if(lens.size() <= 2) + return; + auto m1 = lens.at(lens.size() - 2); + auto m2 = lens.at(lens.size() - 1); + s = shape{s.type(), {m1, m2}}; +} + +inline bool standard_batch(const shape& s) +{ + if(s.lens().size() < 3) + return true; + std::vector lens(s.lens().begin(), s.lens().end() - 2); + std::vector strides(s.strides().begin(), s.strides().end() - 2); + auto base = *(s.lens().end() - 2) * *(s.lens().end() - 1); + std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto stride) { + return stride / base; + }); + return shape{s.type(), lens, strides}.standard(); +} + +inline bool can_fold_batch(const std::vector& inputs) +{ + const auto& b_shape = inputs[1]; + if(std::any_of(inputs.begin() + 2, inputs.end() - 1, [](auto input) { + return not standard_batch(input); + })) + return false; + const auto& b_strides = 
b_shape.strides(); + return std::all_of( + b_strides.begin(), b_strides.end() - 2, [](auto stride) { return stride == 0; }); +} + +} // namespace gpu + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_CK_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/code_object_op.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/code_object_op.hpp new file mode 100644 index 000000000..818676728 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/code_object_op.hpp @@ -0,0 +1,98 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CODE_OBJECT_OP_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_CODE_OBJECT_OP_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +struct code_object_op +{ + value::binary code_object{}; + std::string symbol_name = ""; + std::size_t global = 0; + std::size_t local = 0; + std::vector expected_inputs{}; + shape output{}; + std::int64_t output_arg = -1; + kernel k{}; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.code_object, "code_object"), + f(self.symbol_name, "symbol_name"), + f(self.global, "global"), + f(self.local, "local"), + f(self.expected_inputs, "expected_inputs"), + f(self.output, "output"), + f(self.output_arg, "output_arg")); + } + + value attributes() const { return {{"group", group()}}; } + + std::string group() const { return "gpu::code_object::" + symbol_name; } + + std::string name() const { return "gpu::code_object"; } + shape compute_shape(std::vector inputs) const; + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const; + void finalize(context&, const shape&, const std::vector&); + std::int64_t get_output_arg(std::size_t n) const + { + return output_arg < 0 ? 
n + output_arg : output_arg; + } + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return get_output_arg(shapes.size()); + } + + friend std::ostream& operator<<(std::ostream& os, const code_object_op& op) + { + os << op.name() << "["; + os << "code_object=" << op.code_object.size() << ","; + os << "symbol_name=" << op.symbol_name << ","; + os << "global=" << op.global << ","; + os << "local=" << op.local << ","; + if(op.output_arg != -1) + os << "output_arg=" << op.output_arg << ","; + os << "]"; + return os; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_gen.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_gen.hpp new file mode 100644 index 000000000..03dad2364 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_gen.hpp @@ -0,0 +1,121 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP +#define MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct shape; +struct operation; + +namespace gpu { + +struct context; + +namespace gen { + +struct vectorize +{ + std::size_t size = 1; + std::size_t axis = 0; + static vectorize elements(std::size_t axis, const std::vector& inputs); + static vectorize elements(context& ctx, std::size_t axis, const std::vector& inputs); + static vectorize elements(std::size_t axis, + const std::vector& inputs, + const std::vector& sizes); + std::string str() const; +}; +struct preload +{ + std::vector args = {}; + static preload broadcasts(std::size_t axis, const std::vector& inputs); + bool is_preloading() const; + std::string str() const; +}; +struct tile +{ + enum mode + { + store, + load, + none + }; + std::vector args = {}; + std::size_t axis = 0; + std::size_t ntiles = 0; + std::size_t block_size = 0; + std::vector inner{}; + std::vector outer{}; + static tile elements(const std::vector& inputs, std::size_t noutputs); + // bool is_preloading() const; + std::string str() const; +}; + +MIGRAPHX_GPU_EXPORT std::size_t find_fast_axis(const shape& input); +MIGRAPHX_GPU_EXPORT std::size_t find_fast_axis(const std::vector& inputs); + +std::string make_transformer_args(std::vector transformers); + +template +std::string make_transformer_args(Ts... xs) +{ + return make_transformer_args({xs.str()...}); +} + +std::string +generate_pointwise(const module& pm, const std::string& name, bool always_return_tuple = false); + +std::string generate_reduce(module m, const std::string& name); + +std::string generate_name_from_ops(const module& m, const std::string& postname = ""); + +struct reduce_op +{ + std::vector inputs = {}; + std::string reduction = ""; + std::string init = "0"; + std::string read = "op::id{}"; + std::string write = "op::id{}"; + + void set(instruction_ref ins, const operation& op); + void set(const std::string& name, const shape& input, const shape& output); + std::string str() const; + static std::string generate(instruction_ref ins, const std::vector& x); +}; + +} // namespace gen +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_hip.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_hip.hpp new file mode 100644 index 000000000..d2fa4bcb6 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_hip.hpp @@ -0,0 +1,75 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_COMPILE_HIP_HPP +#define MIGRAPHX_GUARD_RTGLIB_COMPILE_HIP_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +#ifdef MIGRAPHX_USE_HIPRTC +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC); +#endif + +struct hiprtc_src_file +{ + hiprtc_src_file() = default; + hiprtc_src_file(const src_file& s) : path(s.path.string()), content(s.content) {} + std::string path; + std::string content; + template + static auto reflect(Self& self, F f) + { + return pack(f(self.path, "path"), f(self.content, "content")); + } +}; + +MIGRAPHX_GPU_EXPORT bool hip_has_flags(const std::vector& flags); + +MIGRAPHX_GPU_EXPORT std::vector> +compile_hip_src_with_hiprtc(std::vector srcs, + const std::vector& params, + const std::string& arch); + +MIGRAPHX_GPU_EXPORT std::vector> +compile_hip_src(const std::vector& srcs, + const std::vector& params, + const std::string& arch); + +MIGRAPHX_GPU_EXPORT std::string enum_params(std::size_t count, std::string param); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp new file mode 100644 index 000000000..60b8f20a8 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp @@ -0,0 +1,94 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_GPU_COMPILE_HIP_CODE_OBJECT_HPP +#define MIGRAPHX_GUARD_GPU_COMPILE_HIP_CODE_OBJECT_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +struct hip_compile_options +{ + std::size_t global; + std::size_t local; + std::vector inputs; + shape output; + std::string kernel_name = "kernel"; + std::vector params = {}; + std::vector virtual_inputs = {}; + std::vector additional_src_files = {}; + std::int64_t output_arg = -1; + + /** + * @brief Set the launch parameters but allow v to override the values + * + * @param v A value class which can have a "global" and/or "local" keys to override the default + * global and local + * @param compute_global A function used to compute the global based on the local + * @param default_local The defaul local to use if its missing from the v parameter + */ + void set_launch_params(const value& v, + const std::function& compute_global, + std::size_t default_local = 1024); + + void + set_launch_params(const value& v, std::size_t default_global, std::size_t default_local = 1024) + { + set_launch_params( + v, [=](auto) { return default_global; }, default_local); + } + + void emplace_param(std::string_view s) { params.emplace_back(s); } +}; + +/// Compute global for n elements, but max out on target-specific upper limit +MIGRAPHX_GPU_EXPORT std::function +compute_global_for(context& ctx, std::size_t n, std::size_t over = 1); + +MIGRAPHX_GPU_EXPORT operation compile_hip_code_object(context& ctx, + const std::string& content, + hip_compile_options options); + +MIGRAPHX_GPU_EXPORT std::size_t +compute_block_size(context& ctx, std::size_t n, std::size_t max_block_size = 1024); + +template +std::string generate_index_ints(const std::vector& v) +{ + return "index_ints<" + to_string_range(v) + ">{}"; +} + +MIGRAPHX_GPU_EXPORT std::string generate_make_shape(const shape& s); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_COMPILE_HIP_CODE_OBJECT_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_hipblaslt.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_hipblaslt.hpp new file mode 100644 index 000000000..380fafa44 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_hipblaslt.hpp @@ -0,0 +1,77 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_GPU_COMPILE_HIPBLASLT_HPP +#define MIGRAPHX_GUARD_GPU_COMPILE_HIPBLASLT_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module; +struct context; +struct operation; + +namespace gpu { + +struct hipblaslt_op +{ + operation op = op::identity{}; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.op, "op")); + } + + std::string name() const { return "gpu::hipblaslt_op"; } + + shape compute_shape(std::vector inputs) const + { + inputs.push_back(inputs.back()); + return op.compute_shape(inputs); + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; +MIGRAPHX_REGISTER_OP(hipblaslt_op); + +struct compile_hipblaslt +{ + context* ctx = nullptr; + std::string name() const { return "gpu::compile_hipblaslt"; } + void apply(module& m) const; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_COMPILE_HIPBLASLT_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_miopen.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_miopen.hpp new file mode 100644 index 000000000..03dd669e5 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_miopen.hpp @@ -0,0 +1,51 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP +#define MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module; +struct context; +struct operation; + +namespace gpu { + +struct compile_miopen +{ + context* ctx = nullptr; + std::string name() const { return "gpu::compile_miopen"; } + void apply(module& m) const; + std::size_t compile(operation& op, instruction_ref ins) const; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_ops.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_ops.hpp new file mode 100644 index 000000000..6986822a5 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_ops.hpp @@ -0,0 +1,51 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP +#define MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module; + +namespace gpu { + +struct context; + +struct MIGRAPHX_GPU_EXPORT compile_ops +{ + context* ctx = nullptr; + bool exhaustive_tune = false; + std::string name() const { return "gpu::compile_ops"; } + void apply(module& m) const; +}; + +} // namespace gpu + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_pointwise.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_pointwise.hpp new file mode 100644 index 000000000..8e6dc229a --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compile_pointwise.hpp @@ -0,0 +1,45 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP +#define MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP + +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +namespace gpu { + +operation +compile_pointwise(context& ctx, const std::vector& in_shapes, const_module_ref pm); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compiler.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compiler.hpp new file mode 100644 index 000000000..30f927051 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/compiler.hpp @@ -0,0 +1,201 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_GPU_COMPILER_HPP +#define MIGRAPHX_GUARD_GPU_COMPILER_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +struct compiler_replace +{ + compiler_replace() = default; + + compiler_replace(const operation& op) : code_objects{{op}} {} + + template + compiler_replace(const operation& op, F f) : code_objects{{op}}, replace_fn(make_replace(f)) + { + } + + template + compiler_replace(const operation& op, F f, Trace t) + : code_objects{{op}}, replace_fn(make_replace(f)), trace_fn(t) + { + } + + template + compiler_replace(const std::vector& op, F f) + : code_objects{op}, replace_fn(make_replace_all(f)) + { + } + + template + compiler_replace(const std::vector& op, F f, Trace t) + : code_objects{op}, replace_fn(make_replace_all(f)), trace_fn(t) + { + } + + std::vector code_objects = {}; + std::function replace_fn = + nullptr; + std::function trace_fn = nullptr; + + template + static auto make_replace(F f) + { + return [=](const compiler_replace& cr, module& m, instruction_ref ins) { + f(m, ins, cr.code_objects.front()); + }; + } + + template + static auto make_replace_all(F f) + { + return [=](const compiler_replace& cr, module& m, instruction_ref ins) { + f(m, ins, cr.code_objects); + }; + } + + void replace(module& m, instruction_ref ins) const + { + if(replace_fn) + replace_fn(*this, m, ins); + else + { + if(code_objects.size() != 1) + { + MIGRAPHX_THROW("Provide custom replace function to insert multiple code objects\n"); + } + m.replace_instruction(ins, code_objects.front(), ins->inputs()); + } + } + + void trace(std::ostream& os, instruction_ref ins) const + { + if(trace_fn) + trace_fn(os, ins); + } +}; + +using compiler_compile = + std::function; +using compiler_compile_op = + std::function& inputs, const value&)>; +using compiler_tuning_config = + std::function(context&, instruction_ref, const operation&, bool)>; + +MIGRAPHX_GPU_EXPORT void register_compiler(const std::string& name, + compiler_compile c, + compiler_compile_op cop, + compiler_tuning_config ctg); + +MIGRAPHX_GPU_EXPORT bool has_compiler_for(const std::string& name); +MIGRAPHX_GPU_EXPORT compiler_replace compile(context& ctx, + instruction_ref ins, + const operation& op, + const value& solution); +MIGRAPHX_GPU_EXPORT operation compile_op(const std::string& name, + context& ctx, + const std::vector& inputs, + const value& v); +MIGRAPHX_GPU_EXPORT optional +get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive); + +template +void register_compiler() +{ + T c; + for(auto&& name : c.names()) + { + register_compiler( + name, + [=](auto&&... xs) { + return c.invoke_compile(rank<1>{}, std::forward(xs)...); + }, + [=](auto&&... xs) { return c.compile_op(std::forward(xs)...); }, + [=](auto&&... 
xs) { return c.get_tuning_config(std::forward(xs)...); }); + } +} + +struct register_compiler_action +{ + template + static void apply() + { + register_compiler(); + } +}; + +template +using auto_register_compiler = auto_register; + +template +struct compiler : auto_register_compiler +{ + const Derived& derived() const { return static_cast(*this); } + optional + get_tuning_config(context&, instruction_ref, const operation&, bool) const + { + return nullopt; + } + operation compile_op(context&, const std::vector&, const value&) const { return {}; } + + template + auto invoke_compile( + rank<1>, context& ctx, instruction_ref ins, operation op, const value& solution) const + -> decltype(std::declval().compile(ctx, ins, std::move(op), solution)) + { + return derived().compile(ctx, ins, std::move(op), solution); + } + + template + auto invoke_compile( + rank<0>, context& ctx, instruction_ref ins, operation op, const value& solution) const + -> decltype(std::declval().compile(ctx, ins, std::move(op))) + { + assert(solution.empty()); + (void)solution; + return derived().compile(ctx, ins, std::move(op)); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif // MIGRAPHX_GUARD_GPU_COMPILER_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/concat_gpu_opt.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/concat_gpu_opt.hpp new file mode 100644 index 000000000..d5d2f1197 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/concat_gpu_opt.hpp @@ -0,0 +1,52 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_CONCAT_GPU_OPT_HPP +#define MIGRAPHX_GUARD_RTGLIB_CONCAT_GPU_OPT_HPP + +#include +#include +#include + +namespace migraphx { +namespace gpu { + +struct concat_gpu_optimization +{ + std::string allocate() const { return "hip::allocate"; } + optional get_concat(const migraphx::operation& op) const + { + if(op.name() != "gpu::precompile_op") + return nullopt; + auto r = from_value(op.to_value().at("op")); + if(r.name() == "concat") + return any_cast(r); + return nullopt; + } +}; + +} // namespace gpu + +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/config.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/config.hpp new file mode 100644 index 000000000..cd8c6702b --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/config.hpp @@ -0,0 +1,31 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef MIGRAPHX_GUARD_GPU_CONFIG_HPP +#define MIGRAPHX_GUARD_GPU_CONFIG_HPP + +#include +#include + +#endif // MIGRAPHX_GUARD_GPU_CONFIG_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/context.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/context.hpp new file mode 100644 index 000000000..7a1a7d34b --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/context.hpp @@ -0,0 +1,399 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP +#define MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP + +#include +#include +#include +#if !MIGRAPHX_USE_MIOPEN +#include +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NULL_STREAM) +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_NSTREAMS) + +using hip_event_ptr = MIGRAPHX_MANAGE_PTR(hipEvent_t, hipEventDestroy); + +struct hip_device +{ + hip_device() : device_props{} { add_stream(); } + + hip_device(std::size_t id, std::size_t n) : device_id(id) + { + auto status = hipGetDeviceProperties(&device_props, device_id); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to allocate stream"); + + for(std::size_t i = 0; i < n; i++) + add_stream(); + } + + struct stream + { + using hip_stream_ptr = MIGRAPHX_MANAGE_PTR(hipStream_t, hipStreamDestroy); + + stream() {} + + stream(std::size_t device_number) : id(device_number) {} + + void setup() const { set_device(id); } + + static hip_stream_ptr create_stream() + { + hipStream_t result = nullptr; + auto status = hipStreamCreateWithFlags(&result, hipStreamNonBlocking); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to allocate stream"); + return hip_stream_ptr{result}; + } + + hipStream_t get() + { + if(not enabled(MIGRAPHX_ENABLE_NULL_STREAM{})) + { + setup(); + if(s == nullptr) + s = create_stream(); + assert(s.get() != nullptr); + return s.get(); + } + return nullptr; + } + +#if MIGRAPHX_USE_MIOPEN + auto create_miopen_handle() + { + if(not enabled(MIGRAPHX_ENABLE_NULL_STREAM{})) + return make_obj(&miopenCreateWithStream, get()); + else + return make_obj(&miopenCreate); + } + + auto get_miopen() + { + setup(); + if(mihandle == nullptr) + mihandle = create_miopen_handle(); + assert(mihandle.get() != nullptr); + return mihandle.get(); + } +#endif + +#if MIGRAPHX_USE_ROCBLAS + auto get_rocblas() + { + setup(); + if(rbhandle == nullptr) + rbhandle = create_rocblas_handle_ptr(get()); + assert(rbhandle.get() != nullptr); + return rbhandle.get(); + } +#endif + +#if MIGRAPHX_USE_HIPBLASLT + auto get_hipblaslt() + { + setup(); + if(hblthandle == nullptr) + { + hblthandle = create_hipblaslt_handle_ptr(); + } + assert(hblthandle.get() != nullptr); + return hblthandle.get(); + } +#endif + + void wait() const + { + if(s == nullptr) + return; + setup(); + auto status = hipStreamSynchronize(s.get()); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to wait."); + } + + void wait(hipEvent_t event) + { + setup(); + auto status = hipStreamWaitEvent(get(), event, 0); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to wait."); + } + + void record(hipEvent_t event) + { + setup(); + auto status = hipEventRecord(event, get()); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to record."); + } + + private: + std::size_t id = 0; + shared s = nullptr; +#if MIGRAPHX_USE_MIOPEN + shared mihandle = nullptr; +#endif +#if MIGRAPHX_USE_ROCBLAS + shared rbhandle = nullptr; +#endif + +#if MIGRAPHX_USE_HIPBLASLT + shared hblthandle = nullptr; +#endif + }; + + void add_stream() { streams.emplace_back(device_id); } + + stream& get_stream() { return streams.at(current_stream); } + + 
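+    // Note: the indexed get_stream(n) overloads below fetch a specific stream created by
+    // add_stream(); set_stream()/current_stream control which stream the unindexed
+    // accessors return.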
stream& get_stream(std::size_t n) { return streams.at(n); } + + const stream& get_stream() const { return streams.at(current_stream); } + + const stream& get_stream(std::size_t n) const { return streams.at(n); } + + void set_stream(std::size_t n) { current_stream = n; } + + std::size_t nstreams() const { return streams.size(); } + + std::size_t stream_id() const { return current_stream; } + + std::string get_device_name() const { return device_props.gcnArchName; } + + std::string get_gfx_name() const { return trim(split_string(get_device_name(), ':').front()); } + + std::size_t get_device_major() const { return device_props.major; } + + std::size_t get_device_minor() const { return device_props.minor; } + + std::size_t get_cu_count() const { return device_props.multiProcessorCount; } + + std::size_t get_max_workitems_per_cu() const + { + return device_props.maxThreadsPerMultiProcessor; + } + + std::size_t get_max_workitems_per_block() const { return device_props.maxThreadsPerBlock; } + + std::size_t get_wavefront_size() const { return device_props.warpSize; } + + private: + std::size_t device_id = 0; + std::size_t current_stream = 0; + std::vector streams; + hipDeviceProp_t device_props; + + public: + std::unordered_map preallocations{}; +}; + +struct context +{ + struct auto_save_problem_cache : problem_cache + { + auto_save_problem_cache() : problem_cache{} {} + + bool auto_save = false; + + auto_save_problem_cache(const auto_save_problem_cache&) = delete; + auto_save_problem_cache& operator=(const auto_save_problem_cache&) = delete; + virtual ~auto_save_problem_cache() + { + if(auto_save) + this->save(); + } + }; + context(std::size_t device_id = 0, std::size_t n = value_of(MIGRAPHX_NSTREAMS{}, 1)) + : current_device(std::make_shared(device_id, n)), + begin_event(create_event()), + finish_event(create_event()), + pc(std::make_shared()) + { + } + + hip_device& get_current_device() + { + assert(current_device != nullptr); + return *current_device; + } + + const hip_device& get_current_device() const + { + assert(current_device != nullptr); + return *current_device; + } + + bool get_exhaustive_tune_flag() const { return exhaustive_tune; } + + void set_exhaustive_tune_flag(bool t) { exhaustive_tune = t; } + + hip_device::stream& get_stream() { return get_current_device().get_stream(); } + hip_device::stream& get_stream(std::size_t n) { return get_current_device().get_stream(n); } + + const hip_device::stream& get_stream() const { return get_current_device().get_stream(); } + const hip_device::stream& get_stream(std::size_t n) const + { + return get_current_device().get_stream(n); + } + + void set_stream(std::size_t n) { get_current_device().set_stream(n); } + + void create_events(std::size_t num_of_events) + { + for(std::size_t i = events.size(); i < num_of_events + 1; ++i) + events.emplace_back(create_event()); + } + + hipEvent_t get_event(std::size_t i) const { return events.at(i).get(); } + + std::vector literals{}; + void finish() const { get_stream().wait(); } + + static hip_event_ptr create_event() + { + hipEvent_t event; + auto status = hipEventCreateWithFlags(&event, hipEventDisableTiming); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to create event"); + return hip_event_ptr{event}; + } + + static hip_event_ptr create_event_for_timing() + { + hipEvent_t event; + auto status = hipEventCreate(&event); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to create event"); + return hip_event_ptr{event}; + } + + value to_value() const + { + value result; + result["events"] = 
events.size(); + result["streams"] = current_device->nstreams(); + + return result; + } + + void from_value(const value& v) + { + auto v_events = v.at("events"); + std::size_t n_events = v_events.without_key().to(); + this->create_events(n_events - 1); + + auto v_streams = v.at("streams"); + std::size_t n_streams = v_streams.without_key().to(); + + auto device = get_device_id(); + this->current_device = std::make_shared(device, n_streams); + } + + void wait_for(any_ptr queue) + { + auto status = hipEventRecord(begin_event.get(), queue.get()); + if(status != hipSuccess) + MIGRAPHX_THROW("failed to record " + hip_error(status)); + + get_stream().wait(begin_event.get()); + } + + void finish_on(any_ptr queue) + { + get_stream().record(finish_event.get()); + + auto status = hipStreamWaitEvent(queue.get(), finish_event.get(), 0); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to wait on event " + hip_error(status)); + } + + any_ptr get_queue() { return get_stream().get(); } + + std::pair get_perf_events() const + { + if(measure_perf) + return std::make_pair(start_event.get(), stop_event.get()); + return std::make_pair(nullptr, nullptr); + } + + static float get_elapsed_ms(hipEvent_t start, hipEvent_t stop) + { + float result = 0; + if(start != nullptr and stop != nullptr) + { + auto status = hipEventElapsedTime(&result, start, stop); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed hipEventElapsedTime: " + hip_error(status)); + } + return result; + } + + problem_cache& get_problem_cache() { return *pc; } + void load_problem_cache() + { + pc->load(); + pc->auto_save = true; + } + + private: + // TODO: Make this a vector to support multiple devices + std::shared_ptr current_device; + std::vector> events; + bool exhaustive_tune = false; + bool measure_perf = false; + // for event perf timing + shared start_event = nullptr; + shared stop_event = nullptr; + // for stream syncronization + shared begin_event = nullptr; + shared finish_event = nullptr; + std::shared_ptr pc = nullptr; +}; + +inline void migraphx_to_value(value& v, const context& ctx) { v = ctx.to_value(); } +inline void migraphx_from_value(const value& v, context& ctx) { ctx.from_value(v); } + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/contiguous.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/contiguous.hpp new file mode 100644 index 000000000..638f4571a --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/contiguous.hpp @@ -0,0 +1,54 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_CONTIGUOUS_HPP +#define MIGRAPHX_GUARD_RTGLIB_CONTIGUOUS_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +struct miopen_contiguous : unary_device +{ + std::string name() const { return "gpu::contiguous"; } + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(2); + auto lens = inputs.at(0).lens(); + auto t = inputs.at(0).type(); + return {t, lens}; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/convolution.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/convolution.hpp new file mode 100644 index 000000000..1a6d1bc24 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/convolution.hpp @@ -0,0 +1,352 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_CONVOLUTION_HPP +#define MIGRAPHX_GUARD_RTGLIB_GPU_CONVOLUTION_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +inline shape reshape_if_1d(const shape& input) +{ + shape new_shape{input}; + auto dims = new_shape.lens(); + + if(dims.size() == 3) + { + std::vector new_dims = dims; + new_dims.insert(new_dims.begin() + 2, 1); + new_shape = shape{input.type(), new_dims}; + } + return new_shape; +} +#if MIGRAPHX_USE_MIOPEN +template +struct miopen_convolution +{ + Op op; + shared cd = nullptr; + miopenConvFwdAlgorithm_t algo{}; +#ifdef MIGRAPHX_HAS_FIND_2_API + value::binary solution_object{}; + shared solution_ptr = nullptr; +#endif + uint64_t solution_id = 0; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.op, "op"), +#ifdef MIGRAPHX_HAS_FIND_2_API + f(self.solution_object, "solution_object"), +#endif + f(self.algo, "algo"), + f(self.solution_id, "solution_id")); + } + + std::string name() const { return "gpu::" + op.name(); } + + inline shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, op}.has(4); + std::vector conv_inputs(inputs.begin(), inputs.begin() + 2); + check_shapes{conv_inputs, *this} + .max_ndims(5) + .packed_layouts({{0, 1, 2}, {0, 1, 2, 3}, {0, 2, 3, 1}, {0, 1, 2, 3, 4}}) + .same_layout(); + return migraphx::compute_shape(op, conv_inputs); + } + + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const + { + auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape())); + auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape())); + auto y_desc = make_tensor(reshape_if_1d(output_shape)); + auto* miopen_stream_handle = ctx.get_stream().get_miopen(); + auto workspace_size = args[2].get_shape().bytes(); + +#ifdef MIGRAPHX_HAS_FIND_2_API + { + const miopenTensorArgument_t tensor_args[3] = { + {miopenTensorConvolutionX, nullptr, args[0].implicit()}, + {miopenTensorConvolutionW, nullptr, args[1].implicit()}, + {miopenTensorConvolutionY, nullptr, args[3].implicit()}, + }; + + if(solution_ptr.get() == nullptr) + MIGRAPHX_THROW("MIOpen " + op.name() + " : Load MIOpen Solution before running it"); + + auto status = miopenRunSolution(miopen_stream_handle, + solution_ptr.get(), + 3, + tensor_args, + args[2].implicit(), + workspace_size); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen " + op.name() + + " : running convolution using find_2.0 failed"); + + return args[3]; + } +#else + // else use immediate mode + if(solution_id == 0) + MIGRAPHX_THROW("MIOpen " + op.name() + " : invalid solution ID"); + + auto status = miopenConvolutionForwardImmediate(miopen_stream_handle, + w_desc.get(), + args[1].implicit(), + x_desc.get(), + args[0].implicit(), + cd.get(), + y_desc.get(), + args[3].implicit(), + args[2].implicit(), + workspace_size, + solution_id); + + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen " + op.name() + ": running convolution failed"); + return args[3]; +#endif + } + + void set_conv_descriptor() + { + cd = + (op.name() == "convolution_backwards") ? 
make_convolution_backwards(op) : make_conv(op); + } + + value compile(migraphx::context& ctx, const shape& output, const std::vector& input) + { + set_conv_descriptor(); + auto ws = find(any_cast(ctx), output, input); + return {{"workspace", ws.bytes()}}; + } + + shape find(context& ctx, const shape& output_shape, const std::vector& inputs) + { + shape workspace_shape{}; + auto x_desc = make_tensor(reshape_if_1d(inputs[0])); + auto w_desc = make_tensor(reshape_if_1d(inputs[1])); + auto y_desc = make_tensor(reshape_if_1d(output_shape)); + + auto* miopen_stream_handle = ctx.get_stream().get_miopen(); + std::size_t workspace_size = 0; + auto status = miopenConvolutionForwardGetWorkSpaceSize(miopen_stream_handle, + w_desc.get(), + x_desc.get(), + cd.get(), + y_desc.get(), + &workspace_size); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen" + op.name() + " : Failed to get forward workspace size"); + + workspace_shape = shape{shape::int8_type, {workspace_size}}; + + const auto& x_shape = inputs[0]; + const auto& w_shape = inputs[1]; + + unsigned long seed = 0; +#ifdef MIGRAPHX_HAS_FIND_2_API + { + auto conv_problem = make_obj( + &miopenCreateConvProblem, cd.get(), miopenProblemDirectionForward); + + set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem); + set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem); + bool preallocate = false; +#ifdef MIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS + // MIOpen has APIs to pass pre-allocated buffers starting from rocm-5.6 + preallocate = true; +#endif + auto x = preallocate ? to_gpu(generate_argument(x_shape, seed++, random_mode::random)) + : argument{inputs[0]}; + auto w = preallocate ? to_gpu(generate_argument(w_shape, seed++, random_mode::random)) + : argument{inputs[1]}; + auto y = preallocate ? allocate_gpu(output_shape) : argument{inputs[2]}; + auto workspace = + preallocate ? 
allocate_gpu(workspace_shape) : migraphx::argument(workspace_shape); + + set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem); + + const miopenTensorArgument_t tensor_args[3] = { + {miopenTensorConvolutionX, nullptr, x.implicit()}, + {miopenTensorConvolutionW, nullptr, w.implicit()}, + {miopenTensorConvolutionY, nullptr, y.implicit()}, + }; + + solution_ptr = find_solution(miopen_stream_handle, + 3, + tensor_args, + workspace.implicit(), + workspace_size, + conv_problem.get(), + ctx.get_exhaustive_tune_flag()); + + status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen" + op.name() + " : failed to get solution's workspace size"); + + std::size_t solution_size; + status = miopenGetSolutionSize(solution_ptr.get(), &solution_size); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen" + op.name() + ": Failed to fetch solution size"); + + auto solution_binary = std::vector{}; + solution_binary.resize(solution_size); + + status = miopenSaveSolution(solution_ptr.get(), solution_binary.data()); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen" + op.name() + ": Saving solution failed"); + solution_object = value::binary{solution_binary.data(), solution_size}; + return shape{shape::int8_type, {workspace_size}}; + } +#else + auto x = to_gpu(generate_argument(x_shape, seed++, random_mode::random)); + auto w = to_gpu(generate_argument(w_shape, seed++, random_mode::random)); + auto y = allocate_gpu(output_shape); + auto workspace = allocate_gpu(workspace_shape); + int algo_count = 1; + miopenConvAlgoPerf_t perf; + status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(), + x_desc.get(), + x.implicit(), + w_desc.get(), + w.implicit(), + cd.get(), + y_desc.get(), + y.implicit(), + 1, + &algo_count, + &perf, + workspace.implicit(), + workspace_size, + ctx.get_exhaustive_tune_flag()); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen " + op.name() + " : find convolution failed"); + algo = perf.fwd_algo; + size_t solution_count; + + status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(), + w_desc.get(), + x_desc.get(), + cd.get(), + y_desc.get(), + &solution_count); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen " + op.name() + ": get solution count failed"); + + std::vector solutions(solution_count); + + status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(), + w_desc.get(), + x_desc.get(), + cd.get(), + y_desc.get(), + solution_count, + &solution_count, + solutions.data()); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen " + op.name() + ": get solution failed"); + + solution_id = solutions.front().solution_id; + + return shape{shape::int8_type, {perf.memory}}; +#endif + } + + void finalize(context& ctx, const shape& output_shape, const std::vector& inputs) + { +#ifdef MIGRAPHX_HAS_FIND_2_API + { + (void)(ctx); // avoid warnings + (void)(output_shape); + (void)(inputs); + // load solution + if(solution_ptr == nullptr) + { + miopenSolution_t ptr; + auto status = + miopenLoadSolution(&ptr, + reinterpret_cast(solution_object.data()), + solution_object.size()); + solution_ptr = miopen_solution{ptr}; + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen " + op.name() + ": loading convolution solution failed"); + } + } +#else + // Use immediate mode API + { + set_conv_descriptor(); + if(solution_id == 0) + { + // Check that workspace hasn't changed + auto size = inputs.at(2).bytes(); + 
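+                // find() is re-run here to pick a solution_id at finalize time; the workspace
+                // it reports must still fit in the buffer sized at compile time, otherwise the
+                // throw below flags the mismatch.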
auto ws = find(ctx, output_shape, inputs); + if(ws.bytes() > size) + MIGRAPHX_THROW("MIOpen " + op.name() + + ": workspace has changed during finalization."); + } + + auto x_desc = make_tensor(reshape_if_1d(inputs[0])); + auto w_desc = make_tensor(reshape_if_1d(inputs[1])); + auto y_desc = make_tensor(reshape_if_1d(output_shape)); + + auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(), + w_desc.get(), + x_desc.get(), + cd.get(), + y_desc.get(), + solution_id); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen Convolution: compile solution failed"); + } +#endif + } + + inline std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; +#endif +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/arg_op.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/arg_op.hpp new file mode 100644 index 000000000..db8505b09 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/arg_op.hpp @@ -0,0 +1,172 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARG_OP_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARG_OP_HPP + +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +template +struct val_index +{ + T val; + int64_t index; +}; + +template +MIGRAPHX_DEVICE_CONSTEXPR val_index make_val_index(T v) +{ + return {v, -1}; +} + +template +MIGRAPHX_DEVICE_CONSTEXPR val_index make_val_index(T v, int64_t i) +{ + return {v, i}; +} + +struct argmax_op_first_index +{ + template + MIGRAPHX_DEVICE_CONSTEXPR val_index operator()(val_index x, val_index y) const + { + if(x.val > y.val) + return x; + else if(x.val < y.val) + return y; + else + { + return (x.index < y.index) ? x : y; + } + } + + MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return lowest(); } +}; + +struct argmax_op_last_index +{ + template + MIGRAPHX_DEVICE_CONSTEXPR val_index operator()(val_index x, val_index y) const + { + if(x.val > y.val) + return x; + else if(x.val < y.val) + return y; + else + { + return (x.index > y.index) ? 
x : y; + } + } + + MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return lowest(); } +}; + +struct argmin_op_first_index +{ + template + MIGRAPHX_DEVICE_CONSTEXPR val_index operator()(val_index x, val_index y) const + { + if(x.val < y.val) + return x; + else if(x.val > y.val) + return y; + else + { + return (x.index < y.index) ? x : y; + } + } + + MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return highest(); } +}; + +struct argmin_op_last_index +{ + template + MIGRAPHX_DEVICE_CONSTEXPR val_index operator()(val_index x, val_index y) const + { + if(x.val < y.val) + return x; + else if(x.val > y.val) + return y; + else + { + return (x.index > y.index) ? x : y; + } + } + + MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return highest(); } +}; + +template +void arg_op(Op op, hipStream_t stream, const argument& result, const argument& arg, int64_t axis) +{ + auto arg_shape = arg.get_shape(); + auto batch_lens = arg_shape.lens(); + size_t batch_item_num = batch_lens[axis]; + batch_lens[axis] = 1; + migraphx::shape batch_shape{arg_shape.type(), batch_lens}; + migraphx::shape std_arg_shape{arg_shape.type(), arg_shape.lens()}; + + hip_visit_all(arg, std_arg_shape, batch_shape)([&](auto input, auto arg_s, auto batch_s) { + auto* output = device_cast(result.get().data()); + using type = device_type>; + // use one block for items in one batch. + const size_t max_block_size = 256; + const std::size_t block_size = compute_block_size(batch_item_num, max_block_size); + gs_launch(stream, + batch_shape.elements() * block_size, + block_size)([=](auto i, auto idx) __device__ { + auto batch_idx = batch_s.multi(i / block_size); + auto data_idx = batch_idx; + auto init = make_val_index(op.init()); + + auto op_output = + block_reduce(idx, op, init, batch_item_num, [&](auto j) __device__ { + data_idx[axis] = j; + return make_val_index(input[arg_s.index(data_idx)], j); + }); + + if(idx.local == 0) + { + output[batch_s.index(batch_idx)] = op_output.index; + } + }); + }); +} + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/argmax.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/argmax.hpp new file mode 100644 index 000000000..be6023737 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/argmax.hpp @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMAX_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMAX_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void MIGRAPHX_DEVICE_EXPORT argmax(hipStream_t stream, + const argument& result, + const argument& arg, + int64_t axis, + bool select_last_index); + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/argmin.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/argmin.hpp new file mode 100644 index 000000000..c205fcf72 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/argmin.hpp @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMIN_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMIN_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void MIGRAPHX_DEVICE_EXPORT argmin(hipStream_t stream, + const argument& result, + const argument& arg, + int64_t axis, + bool select_last_index); + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/config.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/config.hpp new file mode 100644 index 000000000..014a5f3a3 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/config.hpp @@ -0,0 +1,30 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_CONFIG_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_CONFIG_HPP + +#include +#include + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/contiguous.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/contiguous.hpp new file mode 100644 index 000000000..5012955de --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/contiguous.hpp @@ -0,0 +1,45 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_KERNELS_HPP +#define MIGRAPHX_GUARD_MIGRAPHLIB_KERNELS_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void MIGRAPHX_DEVICE_EXPORT contiguous(hipStream_t stream, + const argument& result, + const argument& arg); + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/fill.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/fill.hpp new file mode 100644 index 000000000..643b26b2e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/fill.hpp @@ -0,0 +1,43 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_FILL_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_FILL_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void MIGRAPHX_DEVICE_EXPORT fill(hipStream_t stream, const argument& result, unsigned long val); + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp new file mode 100644 index 000000000..0f08b84b5 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp @@ -0,0 +1,46 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
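
As an illustrative aside, the device:: entry points declared in these headers (argmax, argmin, contiguous, fill, and those that follow) share one calling convention: the caller allocates the output argument and passes the HIP stream explicitly. A minimal sketch under those assumptions; `stream` and `input` are hypothetical, and `allocate_gpu` is the helper declared later in this patch (hip.hpp).

// Sketch only: allocate a device buffer, then drive two of the declared helpers.
migraphx::shape s{migraphx::shape::float_type, {2, 3, 4}};
migraphx::argument result = migraphx::gpu::allocate_gpu(s);
// Repack a (possibly transposed or broadcast) input into the standard layout:
migraphx::gpu::device::contiguous(stream, result, input);
// Or initialize a buffer to a constant value:
migraphx::gpu::device::fill(stream, result, 0);
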
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_LOGSOFTMAX_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_LOGSOFTMAX_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void MIGRAPHX_DEVICE_EXPORT logsoftmax(hipStream_t stream, + const argument& result, + const argument& arg, + int64_t axis); + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/multinomial.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/multinomial.hpp new file mode 100644 index 000000000..7998945b0 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/multinomial.hpp @@ -0,0 +1,46 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_MULTINOMIAL_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_MULTINOMIAL_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void MIGRAPHX_DEVICE_EXPORT multinomial(hipStream_t stream, + const argument& result, + const argument& arg0, + const argument& arg1); + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/nonzero.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/nonzero.hpp new file mode 100644 index 000000000..a470a337a --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/nonzero.hpp @@ -0,0 +1,45 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_NONZERO_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_NONZERO_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +argument MIGRAPHX_DEVICE_EXPORT nonzero(hipStream_t stream, + const argument& result, + const argument& arg_data); + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/prefix_scan_sum.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/prefix_scan_sum.hpp new file mode 100644 index 000000000..a51815ec4 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/prefix_scan_sum.hpp @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_DEVICE_PREFIX_SCAN_SUM_HPP +#define MIGRAPHX_GUARD_DEVICE_PREFIX_SCAN_SUM_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void MIGRAPHX_DEVICE_EXPORT prefix_scan_sum(hipStream_t stream, + const argument& result, + const argument& arg, + int32_t axis, + bool exclusive, + bool reverse); + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_DEVICE_PREFIX_SCAN_SUM_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/reverse.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/reverse.hpp new file mode 100644 index 000000000..1414314e6 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/reverse.hpp @@ -0,0 +1,46 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_REVERSE_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_REVERSE_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +argument MIGRAPHX_DEVICE_EXPORT reverse(hipStream_t stream, + argument result, + argument arg1, + const std::vector& axes); + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/rnn_variable_seq_lens.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/rnn_variable_seq_lens.hpp new file mode 100644 index 000000000..950848057 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/rnn_variable_seq_lens.hpp @@ -0,0 +1,58 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_RNN_VARIABLE_SEQ_LENS_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_RNN_VARIABLE_SEQ_LENS_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +void MIGRAPHX_DEVICE_EXPORT rnn_var_sl_shift_sequence(hipStream_t stream, + const argument& result, + const argument& arg_hs, + const argument& arg_sl); + +void MIGRAPHX_DEVICE_EXPORT rnn_var_sl_shift_output(hipStream_t stream, + const argument& result, + const argument& arg_hs, + const argument& arg_sl, + bool is_reverse); + +void MIGRAPHX_DEVICE_EXPORT rnn_var_sl_last_output(hipStream_t stream, + const argument& result, + const argument& arg_hs, + const argument& arg_sl, + bool is_reverse); + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/topk.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/topk.hpp new file mode 100644 index 000000000..b1fb4e8e2 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device/topk.hpp @@ -0,0 +1,55 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TOPK_HPP +#define MIGRAPHX_GUARD_RTGLIB_DEVICE_TOPK_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +namespace device { + +argument MIGRAPHX_DEVICE_EXPORT topk_smallest(hipStream_t stream, + const argument& val_res, + const argument& ind_res, + const argument& arg, + int64_t k, + int64_t axis); + +argument MIGRAPHX_DEVICE_EXPORT topk_largest(hipStream_t stream, + const argument& val_res, + const argument& ind_res, + const argument& arg, + int64_t k, + int64_t axis); + +} // namespace device +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device_name.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device_name.hpp new file mode 100644 index 000000000..bdd9530aa --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/device_name.hpp @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_GPU_DEVICE_NAME_HPP +#define MIGRAPHX_GUARD_GPU_DEVICE_NAME_HPP + +#include +#include + +struct hipDeviceProp_t; + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_GPU_EXPORT std::string get_device_name(); + +MIGRAPHX_GPU_EXPORT int get_device_id(); + +MIGRAPHX_GPU_EXPORT bool gfx_has_fp8fnuz_intrinsics(); + +MIGRAPHX_GPU_EXPORT bool gfx_has_fp8ocp_intrinsics(); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_DEVICE_NAME_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/fuse_ck.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/fuse_ck.hpp new file mode 100644 index 000000000..ee726b5b7 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/fuse_ck.hpp @@ -0,0 +1,48 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
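
The device_name.hpp declarations above are small runtime queries used to pick a code path per GPU architecture. A minimal usage sketch; the device string shown is hypothetical.

// Sketch only: select an fp8 code path based on the detected gfx architecture.
std::string dev = migraphx::gpu::get_device_name(); // e.g. "gfx90a" (hypothetical value)
if(migraphx::gpu::gfx_has_fp8fnuz_intrinsics())
{
    // use fp8-fnuz kernels
}
else if(migraphx::gpu::gfx_has_fp8ocp_intrinsics())
{
    // use fp8-ocp kernels
}
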
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_GPU_FUSE_CK_HPP +#define MIGRAPHX_GUARD_GPU_FUSE_CK_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module_pass_manager; + +namespace gpu { + +struct fuse_ck +{ + context* ctx = nullptr; + std::string name() const { return "gpu::fuse_ck"; } + void apply(module_pass_manager& mpm) const; +}; + +} // namespace gpu + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_FUSE_CK_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/fuse_mlir.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/fuse_mlir.hpp new file mode 100644 index 000000000..e1cb8f0bb --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/fuse_mlir.hpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP +#define MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module_pass_manager; + +namespace gpu { + +MIGRAPHX_GPU_EXPORT bool mlir_enabled(); + +struct MIGRAPHX_GPU_EXPORT fuse_mlir +{ + context* ctx = nullptr; + bool enable_extra = false; + std::string name() const { return "gpu::fuse_mlir"; } + void apply(module_pass_manager& mpm) const; +}; + +} // namespace gpu + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/fuse_ops.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/fuse_ops.hpp new file mode 100644 index 000000000..fc8ef2256 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/fuse_ops.hpp @@ -0,0 +1,48 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_FUSE_OPS_HPP +#define MIGRAPHX_GUARD_RTGLIB_FUSE_OPS_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module; + +namespace gpu { + +struct MIGRAPHX_GPU_EXPORT fuse_ops +{ + context* ctx = nullptr; + bool fast_math = true; + std::string name() const { return "gpu::fuse_ops"; } + void apply(module& m) const; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/gemm.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/gemm.hpp new file mode 100644 index 000000000..23f053dd5 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/gemm.hpp @@ -0,0 +1,163 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
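
fuse_ck, fuse_mlir, and fuse_ops above are compiler passes: each carries a pointer to the GPU context plus a flag or two and exposes name()/apply(). A rough sketch of how such a pass is scheduled, assuming migraphx::run_passes from the usual pass_manager.hpp; `ctx` and `mod` are hypothetical.

// Sketch only: run the operator-fusion pass over a module.
migraphx::gpu::fuse_ops fuse{&ctx, /*fast_math=*/true};
migraphx::run_passes(mod, {fuse});
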
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_GEMM_HPP +#define MIGRAPHX_GUARD_RTGLIB_GPU_GEMM_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; +shape transpose_batch(const shape& s, unsigned trans_batch); +void blas_shape(const shape& s); + +template +struct rocblas_gemm +{ + Op op; + float alpha = 1; + float beta = 0; + bool compute_fp32 = false; + unsigned trans_batch = 0; + int32_t solution_idx = 0; + template + static auto reflect(Self& self, F f) + { + return pack_join(migraphx::reflect(self.op, f), + pack(f(self.alpha, "alpha"), + f(self.beta, "beta"), + f(self.compute_fp32, "compute_fp32"), + f(self.trans_batch, "trans_batch"), + f(self.solution_idx, "solution_idx"))); + } + + std::string name() const + { + if(contains(op.name(), "quant_")) + { + return "gpu::quant_gemm"; + } + return "gpu::gemm"; + } + + shape compute_shape(const std::vector& inputs) const + { + std::vector in_shapes(inputs); + in_shapes.pop_back(); + // When input shapes are A, B, C the GEMM equation is C  =  α AB+ β C where α, β are + // scalars + check_shapes{in_shapes, *this}.has(2, 3); + blas_shape(inputs[0]); + blas_shape(inputs[1]); + // if gemm and add are fused + if(in_shapes.size() > 2) + { + auto cmat_shape = in_shapes.back(); + check_shapes{{cmat_shape}, *this}.not_transposed().not_broadcasted(); + in_shapes.pop_back(); + blas_shape(cmat_shape); + auto op_out_shape = op.compute_shape(in_shapes); + if(cmat_shape.lens() != op_out_shape.lens()) + { + MIGRAPHX_THROW(this->name() + " : dimension mismatch, operand C: {" + + to_string_range(cmat_shape.lens()) + + "}, cannot add to operand A * B: {" + + to_string_range(op_out_shape.lens()) + "}"); + } + if(cmat_shape.type() != op_out_shape.type()) + { + MIGRAPHX_THROW(this->name() + " : operand C type mismatch, operand C is of type: " + + to_string(cmat_shape.type()) + + ", it must be: " + to_string(op_out_shape.type())); + } + return transpose_batch(op_out_shape, trans_batch); + } + + return transpose_batch(op.compute_shape(in_shapes), trans_batch); + } + + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const + { + if(this->name() == "gpu::gemm" or output_shape.type() == migraphx::shape::float_type) + { + gemm_compute(ctx, output_shape, args, alpha, beta, compute_fp32, solution_idx); + } + else + { + 
gemm_compute( + ctx, output_shape, args, int32_t(alpha), int32_t(beta), compute_fp32, solution_idx); + } + return args.back(); + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } + + void finalize(context& ctx, const shape& output_shape, const std::vector& input_shapes) + { +#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API + if(solution_idx == 0) + solution_idx = gemm_default_solution(ctx, output_shape, input_shapes); + if(enabled(MIGRAPHX_ENABLE_GEMM_TUNING{}) or ctx.get_exhaustive_tune_flag()) + { + if(this->name() == "gpu::gemm") + { + solution_idx = gemm_finalize( + ctx, output_shape, input_shapes, alpha, beta, compute_fp32, solution_idx); + } + else + { + solution_idx = gemm_finalize(ctx, + output_shape, + input_shapes, + int32_t(alpha), + int32_t(beta), + compute_fp32, + solution_idx); + } + } +#else + // suppress compiler warnings + (void)ctx, (void)output_shape, (void)input_shapes; +#endif // MIGRAPHX_USE_ROCBLAS_TUNING_API + } +}; +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_RTGLIB_GPU_GEMM_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/gemm_impl.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/gemm_impl.hpp new file mode 100644 index 000000000..76891cbb8 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/gemm_impl.hpp @@ -0,0 +1,94 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_GEMM_IMPL_HPP +#define MIGRAPHX_GUARD_RTGLIB_GEMM_IMPL_HPP + +#include +#include +#include +#include + +// Set this environment variable to "true" to perform GEMM tuning even when the +// --exhaustive-tune option isn't set. Can be used to skip slow convolution tuning. +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_GEMM_TUNING); + +using milliseconds = std::chrono::duration; +using microseconds = std::chrono::duration; + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +/** + * @brief Templated implementations of the compute() and finalize() methods of the Gemm operator. + * For each function there are overloads using either float or int32_t for the arguments + * alpha and beta. + * + * @param ctx . + * @param output_shape . + * @param args . + * @param alpha . + * @param beta . + * @param compute_fp32 . 
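
The rocblas_gemm operator above routes to one of two gemm_compute overloads this header declares just below: float alpha/beta for regular GEMM, int32_t alpha/beta for quantized (int8) GEMM. A minimal sketch of that dispatch outside the operator; the wrapper function and all values are hypothetical.

// Sketch only: C = alpha * A * B + beta * C through the two declared overloads.
void run_gemm(migraphx::gpu::context& ctx,
              const migraphx::shape& out,
              const std::vector<migraphx::argument>& args,
              bool quantized)
{
    float alpha = 1.0f;
    float beta  = 0.0f;
    if(not quantized)
        migraphx::gpu::gemm_compute(ctx, out, args, alpha, beta,
                                    /*compute_fp32=*/false, /*solution_idx=*/0);
    else
        migraphx::gpu::gemm_compute(ctx, out, args, int32_t(alpha), int32_t(beta),
                                    /*compute_fp32=*/false, /*solution_idx=*/0);
}
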
+ */ +void gemm_compute(context& ctx, + const shape& output_shape, + const std::vector& args, + float alpha, + float beta, + bool compute_fp32, + int32_t solution_idx); + +void gemm_compute(context& ctx, + const shape& output_shape, + const std::vector& args, + int32_t alpha, + int32_t beta, + bool compute_fp32, + int32_t solution_idx); + +int32_t gemm_finalize(context& ctx, + const shape& output_shape, + const std::vector& input_shapes, + float alpha, + float beta, + bool compute_fp32); + +int32_t gemm_finalize(context& ctx, + const shape& output_shape, + const std::vector& input_shapes, + int32_t alpha, + int32_t beta, + bool compute_fp32, + int32_t solution_idx); + +int32_t gemm_default_solution(context& ctx, + const shape& output_shape, + const std::vector& input_shapes); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp new file mode 100644 index 000000000..6a63bde37 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp @@ -0,0 +1,117 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP +#define MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct gemm_softmax_gemm +{ + operation op = make_op("dot"); + float scale = 1.0; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.op, "op"), f(self.scale, "scale")); + } + + std::string name() const { return "gpu::gemm_softmax_gemm"; } + + void check_gemm_shape(const shape& s) const + { + if(not contains(range(s.strides().rbegin(), s.strides().rbegin() + 3), 1) and + not s.scalar()) + MIGRAPHX_THROW("Invalid shape for " + name()); + } + + shape compute_shape(std::vector inputs, const std::vector&) const + { + check_shapes{inputs, *this}.same_ndims(); + if(inputs.size() < 3) + MIGRAPHX_THROW(name() + ": Expected 3 inputs but got " + to_string(inputs.size())); + + const bool is_bias_enabled = inputs.size() == 4; + const bool is_mul_where = inputs.size() == 5; + auto a = inputs[0]; + auto b = inputs[1]; + auto b1 = inputs.back(); + + for(const auto& input : inputs) + { + check_gemm_shape(input); + } + auto gemm0_shape = op.compute_shape({a, b}); + if(is_mul_where) + { + auto select_cond = inputs[2]; + auto select_const = inputs[3]; + if(select_cond.lens() != select_const.lens()) + { + std::stringstream err_msg; + err_msg << name() << ": has inconsistent where op condition and constant size: " + << select_cond << "!=" << select_const; + MIGRAPHX_THROW(err_msg.str()); + } + if(select_cond.lens() != gemm0_shape.lens()) + { + std::stringstream err_msg; + err_msg << name() << ": has inconsistent where op condition size" + << ". Expected: " << gemm0_shape << ". Given: " << select_cond; + MIGRAPHX_THROW(err_msg.str()); + } + } + if(is_bias_enabled) + { + auto bias_shape = inputs[2]; + if(bias_shape.lens() != gemm0_shape.lens()) + { + std::stringstream err_msg; + err_msg << name() << ": has inconsistent bias size" + << ". Expected: " << gemm0_shape << ". Given: " << bias_shape; + MIGRAPHX_THROW(err_msg.str()); + } + } + + return op.compute_shape({gemm0_shape, b1}); + } + + static bool is_ck_supported_type(shape::type_t t) { return contains({shape::half_type}, t); } + static bool is_mlir_supported_type(shape::type_t t) + { + return contains({shape::type_t::float_type, shape::half_type}, t); + } +}; + +} // namespace gpu + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/group_query_attention.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/group_query_attention.hpp new file mode 100644 index 000000000..b7690c4bf --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/group_query_attention.hpp @@ -0,0 +1,135 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
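
gemm_softmax_gemm above fuses the attention pattern dot(softmax(scale * dot(A, B) [+ bias]), B1), and its compute_shape chains the two dot shapes after validating the optional bias/where inputs. A worked shape example with hypothetical attention sizes (batch 2, heads 8, sequence 64, head dim 32):

// Sketch only: Q*K^T is {2,8,64,64}; the softmax keeps that shape; dot with V gives {2,8,64,32}.
migraphx::shape q{migraphx::shape::half_type, {2, 8, 64, 32}};
migraphx::shape kt{migraphx::shape::half_type, {2, 8, 32, 64}};
migraphx::shape v{migraphx::shape::half_type, {2, 8, 64, 32}};
migraphx::gpu::gemm_softmax_gemm op;
op.scale = 0.17678f; // roughly 1/sqrt(32), hypothetical
auto out = op.compute_shape({q, kt, v}, {});
// out.lens() == {2, 8, 64, 32}
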
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_GPU_GROUP_QUERY_ATTENTION_HPP +#define MIGRAPHX_GUARD_GPU_GROUP_QUERY_ATTENTION_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct gqa_parameters +{ + float scale; + std::uint32_t batch_size; // Batch size used by input + std::uint32_t sequence_length; // Sequence length used by input + std::uint32_t hidden_size; // Hidden size used by input + std::uint32_t head_size; // Head size + std::uint32_t rotary_embedding_dim; // Rotary embedding dimension. + std::uint32_t num_heads; // num_heads = hidden_size / head_size + std::uint32_t max_sequence_length; // Sequence length used by cos/sin cache + std::uint32_t head_stride; // Head stride + std::uint32_t seq_stride; // Sequence stride + std::uint32_t batch_stride; // Batch stride + std::uint32_t position_ids_format; // Format of position ids - 0 is (1), 1 is (batch_size, + // sequence_length) + std::uint32_t seqlen_present_kv_cache; // Sequence length of present kv-cache (4096 when using + // shared buffer) + bool do_rotary; // Whether to use rotary position embedding. Default value is 0. + std::uint32_t kv_num_heads; // Number of attention heads for k and v + int local_window_size; // left_window_size for local attention. Default value is -1 meaning + // unused. + bool rotary_interleaved; // Rotate using interleaved pattern. Default value is 0 (False). 
+ bool past_present_share_buffer; // Whether to use same buffer for KV-cache inputs and outputs + + std::string make_init_str() const + { + return "MIGRAPHX_MAKE_CONSTANT(float{" + std::to_string(scale) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(batch_size) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(sequence_length) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(hidden_size) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(head_size) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(rotary_embedding_dim) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(num_heads) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(max_sequence_length) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(head_stride) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(seq_stride) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(batch_stride) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(position_ids_format) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(seqlen_present_kv_cache) + + "}), " + "MIGRAPHX_MAKE_CONSTANT(bool{" + + std::to_string(static_cast(do_rotary)) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(uint32_t{" + std::to_string(kv_num_heads) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(int32_t{" + std::to_string(local_window_size) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(bool{" + + std::to_string(static_cast(rotary_interleaved)) + "}), " + + "MIGRAPHX_MAKE_CONSTANT(bool{" + + std::to_string(static_cast(past_present_share_buffer)) + "})"; + } +}; + +static inline gqa_parameters init_params(const std::vector& inputs, const value& v) +{ + auto num_heads = v.at("num_heads").to(); + auto kv_num_heads = v.at("kv_num_heads").to(); + auto do_rotary = v.at("do_rotary").to(); + auto local_window_size = v.at("local_window_size").to(); + auto rotary_interleaved = v.at("rotary_interleaved").to(); + auto scale = v.at("scale").to(); + auto present_kv_seqlen = inputs[1].lens().size() == 4 ? inputs[1].lens()[2] : 0; + + const auto& q_shape = inputs[0]; + auto q_lens = q_shape.lens(); + const std::size_t batch_size = q_lens[0]; + const std::size_t sequence_length = q_lens[2]; + std::size_t head_size = q_lens[3]; + auto q_hidden_size = kv_num_heads * head_size; + + std::size_t rotary_dim = inputs[3].lens()[1] * 2; + auto seq_stride = head_size; + auto head_stride = sequence_length * seq_stride; + auto batch_stride = (num_heads + 2 * kv_num_heads) * head_stride; + auto position_ids_format = sequence_length == 1 ? 
1 : 0; + bool past_present_share_buffer = true; + gqa_parameters gqa_params; + gqa_params.batch_size = batch_size; + gqa_params.sequence_length = sequence_length; + gqa_params.hidden_size = q_hidden_size; + gqa_params.head_size = head_size; + gqa_params.rotary_embedding_dim = rotary_dim; + gqa_params.num_heads = num_heads; + gqa_params.max_sequence_length = sequence_length; + gqa_params.seq_stride = head_size; + gqa_params.head_stride = head_stride; + gqa_params.batch_stride = batch_stride; + gqa_params.position_ids_format = position_ids_format; + gqa_params.seqlen_present_kv_cache = present_kv_seqlen; + gqa_params.do_rotary = do_rotary; + gqa_params.kv_num_heads = kv_num_heads; + gqa_params.local_window_size = local_window_size; + gqa_params.rotary_interleaved = rotary_interleaved; + gqa_params.scale = scale; + gqa_params.past_present_share_buffer = past_present_share_buffer; + + return gqa_params; +} + +} // namespace gpu + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_GROUP_QUERY_ATTENTION_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/hip.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/hip.hpp new file mode 100644 index 000000000..acd7525d6 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/hip.hpp @@ -0,0 +1,288 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
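
To make the stride bookkeeping in init_params above concrete, here is a worked example with hypothetical sizes; it only restates the arithmetic the function performs for a packed QKV layout.

// num_heads = 32, kv_num_heads = 8, head_size = 128, sequence_length = 1
// seq_stride   = head_size                                    = 128
// head_stride  = sequence_length * seq_stride                 = 128
// batch_stride = (num_heads + 2 * kv_num_heads) * head_stride = (32 + 16) * 128 = 6144
// position_ids_format = (sequence_length == 1) ? 1 : 0        = 1   (decode step)
// rotary_embedding_dim = lens[1] * 2 of the cos/sin cache input (inputs[3])
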
+ */ +#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_HIP_HPP +#define MIGRAPHX_GUARD_MIGRAPHLIB_HIP_HPP + +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +MIGRAPHX_GPU_EXPORT std::string hip_error(int error); + +MIGRAPHX_GPU_EXPORT argument allocate_gpu(const shape& s, bool host = false); + +MIGRAPHX_GPU_EXPORT argument register_on_gpu(const argument& arg); + +MIGRAPHX_GPU_EXPORT argument to_gpu(const argument& arg, bool host = false); + +MIGRAPHX_GPU_EXPORT argument from_gpu(const argument& arg); + +MIGRAPHX_GPU_EXPORT void set_device(std::size_t id); + +MIGRAPHX_GPU_EXPORT void gpu_sync(); +MIGRAPHX_GPU_EXPORT void gpu_sync(const context& ctx); + +MIGRAPHX_GPU_EXPORT void gpu_copy(context& ctx, const argument& src, const argument& dst); +MIGRAPHX_GPU_EXPORT void copy_to_gpu(context& ctx, const argument& src, const argument& dst); +MIGRAPHX_GPU_EXPORT void copy_from_gpu(context& ctx, const argument& src, const argument& dst); + +MIGRAPHX_GPU_EXPORT argument get_preallocation(context& ctx, const std::string& id); + +MIGRAPHX_GPU_EXPORT void gpu_fill(context& ctx, const argument& dst, int value = 0); + +struct hip_allocate +{ + shape s; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.s, "shape")); + } + + std::string name() const { return "hip::allocate"; } + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(0); + return s; + } + argument compute(context&, const shape& output_shape, const std::vector&) const + { + return allocate_gpu(output_shape); + } +}; + +struct hip_fill +{ + int value = 0; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.value, "value")); + } + + std::string name() const { return "hip::fill"; } + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(1); + return inputs.front(); + } + argument compute(context& ctx, const shape&, const std::vector& args) const + { + gpu_fill(ctx, args.front(), value); + return args.front(); + } + std::ptrdiff_t output_alias(const std::vector&) const { return 0; } +}; + +struct hip_sync_stream +{ + + std::string name() const { return "hip::sync_stream"; } + shape compute_shape(const std::vector& inputs) const + { + if(inputs.empty()) + return {}; + return inputs.front(); + } + + argument compute(const context& ctx, const shape&, const std::vector& args) const + { + gpu_sync(ctx); + if(args.empty()) + return {}; + return args.front(); + } + + std::ptrdiff_t output_alias(const std::vector& args) const + { + if(args.empty()) + return -1; + return 0; + } +}; + +struct hip_copy_to_gpu +{ + std::string name() const { return "hip::copy_to_gpu"; } + shape compute_shape(std::vector inputs) const + { + check_shapes{inputs, *this, true}.has(1, 2).same_type(); + return inputs.at(0); + } + argument compute(context& ctx, const shape&, const std::vector& args) const + { + auto input = register_on_gpu(args[0]); + if(args.size() == 1) + return input; + argument result = args[1].share(); + if(result.get_shape().dynamic()) + { + result = result.reshape(args[0].get_shape()); + } + gpu_copy(ctx, input, result); + // Associate the input since it was registered with hip + return {result.get_shape(), [input, result]() mutable { return result.data(); }}; + } + std::ptrdiff_t output_alias(const std::vector& args) const + { + if(args.size() == 1) + return -1; + return 1; + } +}; + +struct hip_copy_from_gpu +{ + std::string 
name() const { return "hip::copy_from_gpu"; } + shape compute_shape(std::vector inputs) const + { + check_shapes{inputs, *this, true}.has(1, 2).same_type(); + return inputs.at(0); + } + argument + compute(context& ctx, const dyn_output& dyn_out, const std::vector& args) const + { + if(args.size() == 1) + { + argument result = allocate_gpu(dyn_out.computed_shape, true); + gpu_copy(ctx, args[0], result); + return result; + } + argument input = args[0].share(); + if(input.get_shape().dynamic()) + { + input = input.reshape(args[1].get_shape()); + } + copy_from_gpu(ctx, input, args[1]); + return args[1]; + } + std::ptrdiff_t output_alias(const std::vector& args) const + { + if(args.size() == 1) + return -1; + return 1; + } +}; + +struct hip_copy +{ + std::string name() const { return "hip::copy"; } + shape compute_shape(std::vector inputs) const + { + check_shapes{inputs, *this}.has(2).same_type(); + return inputs.at(1); + } + argument compute(context& ctx, const shape&, std::vector args) const + { + gpu_copy(ctx, args[0], args[1]); + return args[1]; + } + std::ptrdiff_t output_alias(const std::vector&) const { return 1; } +}; + +MIGRAPHX_GPU_EXPORT void +store_preallocated_param(context& ctx, const std::string& id, const argument& a); + +struct hip_allocate_memory +{ + shape s; + std::string id{}; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.s, "shape"), f(self.id, "id")); + } + + std::string name() const { return "hip::hip_allocate_memory"; } + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(0); + return s; + } + + argument compute(context& ctx, const shape&, const std::vector&) const + { + return get_preallocation(ctx, id); + } + + void finalize(context& ctx, const shape&, const std::vector&) const + { + argument a = allocate_gpu(s); + store_preallocated_param(ctx, id, a); + } +}; + +struct hip_copy_literal +{ + literal l; + std::string id{}; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.l, "literal"), f(self.id, "id")); + } + + std::string name() const { return "hip::hip_copy_literal"; } + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(0); + return l.get_shape(); + } + + argument compute(context& ctx, const shape&, const std::vector&) const + { + return get_preallocation(ctx, id); + } + + void finalize(context& ctx, const shape&, const std::vector&) const + { + argument a = to_gpu(l.get_argument()); + store_preallocated_param(ctx, id, a); + } + friend std::ostream& operator<<(std::ostream& os, const hip_copy_literal& x) + { + os << x.name() << "[id=" << x.id << "]"; + return os; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/hip_gemm.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/hip_gemm.hpp new file mode 100644 index 000000000..8c3d67bcd --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/hip_gemm.hpp @@ -0,0 +1,146 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
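
The free functions declared at the top of hip.hpp (allocate_gpu, to_gpu, from_gpu, gpu_sync, ...) are what the hip_copy_* and hip_allocate operators above wrap. A minimal host-to-device round trip as a sketch; the data is hypothetical and the argument(shape, pointer) constructor is assumed from the core library.

// Sketch only: copy host data to the GPU, leave room for a result, copy back.
migraphx::shape s{migraphx::shape::float_type, {4, 4}};
std::vector<float> host(s.elements(), 1.0f);
migraphx::argument host_arg{s, host.data()};

auto device_in  = migraphx::gpu::to_gpu(host_arg);      // host -> device copy
auto device_out = migraphx::gpu::allocate_gpu(s);       // uninitialized device buffer
// ... run GPU ops that read device_in and write device_out ...
migraphx::gpu::gpu_sync();                               // wait for completion
auto result = migraphx::gpu::from_gpu(device_out);       // device -> host copy
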
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_HIP_GEMM_HPP +#define MIGRAPHX_GUARD_RTGLIB_GPU_HIP_GEMM_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; +void blas_shape_hip(const shape& s); +shape transpose_batch_hip(const shape& s, unsigned trans_batch); + +template +struct hip_gemm +{ + Op op; + float alpha = 1; + float beta = 0; + unsigned trans_batch = 0; + int32_t solution_idx = 0; + + template + static auto reflect(Self& self, F f) + { + return pack_join(migraphx::reflect(self.op, f), + pack(f(self.alpha, "alpha"), + f(self.beta, "beta"), + f(self.trans_batch, "trans_batch"), + f(self.solution_idx, "solution_idx"))); + } + + std::string name() const + { + if(contains(op.name(), "quant_")) + { + return "gpu::hip_quant_gemm"; + } + return "gpu::hip_gemm"; + } + + shape compute_shape(const std::vector& inputs) const + { + std::vector in_shapes(inputs); + in_shapes.pop_back(); + in_shapes.pop_back(); + // When input shapes are A, B, C the GEMM equation is C  =  α AB+ β C where α, β are + // scalars + check_shapes{in_shapes, *this}.has(2, 3); + blas_shape_hip(inputs[0]); + blas_shape_hip(inputs[1]); + // if gemm and add are fused + if(in_shapes.size() > 2) + { + auto cmat_shape = in_shapes.back(); + check_shapes{{cmat_shape}, *this}.not_transposed().not_broadcasted(); + in_shapes.pop_back(); + blas_shape_hip(cmat_shape); + auto op_out_shape = op.compute_shape(in_shapes); + if(cmat_shape.lens() != op_out_shape.lens()) + { + MIGRAPHX_THROW(this->name() + " : dimension mismatch, operand C: {" + + to_string_range(cmat_shape.lens()) + + "}, cannot add to operand A * B: {" + + to_string_range(op_out_shape.lens()) + "}"); + } + if(cmat_shape.type() != op_out_shape.type()) + { + MIGRAPHX_THROW(this->name() + " : operand C type mismatch, operand C is of type: " + + to_string(cmat_shape.type()) + + ", it must be: " + to_string(op_out_shape.type())); + } + return transpose_batch_hip(op_out_shape, trans_batch); + } + + return transpose_batch_hip(op.compute_shape(in_shapes), trans_batch); + } + + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const + { + hip_gemm_compute(ctx, output_shape, args, alpha, beta, solution_idx); + return args.back(); + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + 
} + + void finalize(context& ctx, const shape& output_shape, const std::vector& input_shapes) + { + if(solution_idx == 0) + solution_idx = hip_gemm_default_solution(ctx, output_shape, input_shapes); + if(enabled(MIGRAPHX_ENABLE_HIP_GEMM_TUNING{}) or ctx.get_exhaustive_tune_flag()) + { + solution_idx = + hip_gemm_finalize(ctx, output_shape, input_shapes, alpha, beta, solution_idx); + } + } + + value + compile(migraphx::context& ctx, const shape& output, const std::vector& input_shapes) + { + finalize(any_cast(ctx), output, input_shapes); + size_t ws = hip_gemm_workspace_size( + any_cast(ctx), output, input_shapes, alpha, beta, solution_idx); + return {{"workspace", ws}}; + } +}; +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_RTGLIB_GPU_HIP_GEMM_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/hip_gemm_impl.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/hip_gemm_impl.hpp new file mode 100644 index 000000000..f26d594d8 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/hip_gemm_impl.hpp @@ -0,0 +1,82 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_HIP_GEMM_IMPL_HPP +#define MIGRAPHX_GUARD_RTGLIB_HIP_GEMM_IMPL_HPP + +#include +#include +#include + +// Set this environment variable to "true" to perform GEMM tuning even when the +// --exhaustive-tune option isn't set. Can be used to skip slow convolution tuning. +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_HIP_GEMM_TUNING); + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +using milliseconds = std::chrono::duration; +using microseconds = std::chrono::duration; + +/** + * @brief Templated implementations of the compute() and finalize() methods of the Gemm operator. + * For each function there are overloads using either float or int32_t for the arguments + * alpha and beta. + * + * @param ctx . + * @param output_shape . + * @param args . + * @param alpha . + * @param beta . 
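
A compact restatement, as an illustrative sketch, of the tuning flow implied by hip_gemm::finalize and the declarations above:

// 1. With solution_idx == 0, hip_gemm_default_solution() supplies a baseline index.
// 2. If MIGRAPHX_ENABLE_HIP_GEMM_TUNING is set (or exhaustive tuning is enabled),
//    hip_gemm_finalize() benchmarks candidate solutions and returns the fastest index,
//    which is stored back into solution_idx.
// 3. compile() then asks hip_gemm_workspace_size() for the scratch space the chosen
//    solution needs and reports it as {"workspace", ws}.
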
+ */ +void hip_gemm_compute(context& ctx, + const shape& output_shape, + const std::vector& args, + float alpha, + float beta, + int32_t solution_idx); + +int32_t hip_gemm_finalize(context& ctx, + const shape& output_shape, + const std::vector& input_shapes, + float alpha, + float beta, + int32_t solution_idx); + +int32_t hip_gemm_default_solution(context& ctx, + const shape& output_shape, + const std::vector& input_shapes); + +size_t hip_gemm_workspace_size(context& ctx, + const shape& output_shape, + const std::vector& input_shapes, + float alpha, + float beta, + int32_t solution_idx); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/hipblaslt.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/hipblaslt.hpp new file mode 100644 index 000000000..49d41bf4d --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/hipblaslt.hpp @@ -0,0 +1,109 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_HIPBLASLT_HPP +#define MIGRAPHX_GUARD_MIGRAPHLIB_HIPBLASLT_HPP +#include +#include +#include +#include +#if MIGRAPHX_USE_HIPBLASLT +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +// TODO: Remove hipblas_status_to_string() function when hipblaslt +// provides an API for doing this in hipBLASLt. 
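Illustrative sketch (not part of the patch): the four hip_gemm_impl.hpp declarations above are meant to be driven in this order by the hip_gemm operator shown earlier; ctx, out_shape, in_shapes and args stand in for the values that finalize()/compute() already receive, and tuning_requested is a placeholder for the env-var / exhaustive-tune check.

    int32_t idx = hip_gemm_default_solution(ctx, out_shape, in_shapes); // heuristic pick
    if(tuning_requested)
        idx = hip_gemm_finalize(ctx, out_shape, in_shapes, 1.0f, 0.0f, idx); // benchmark candidates
    size_t ws = hip_gemm_workspace_size(ctx, out_shape, in_shapes, 1.0f, 0.0f, idx);
    // at run time, the selected solution is replayed:
    hip_gemm_compute(ctx, out_shape, args, 1.0f, 0.0f, idx);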
+ +// Convert hipblas_status to string +inline const char* hipblas_status_to_string(hipblasStatus_t status) +{ + switch(status) + { + case HIPBLAS_STATUS_SUCCESS: return "HIPBLAS_STATUS_SUCCESS"; + case HIPBLAS_STATUS_NOT_INITIALIZED: return "HIPBLAS_STATUS_NOT_INITIALIZED"; + case HIPBLAS_STATUS_ALLOC_FAILED: return "HIPBLAS_STATUS_ALLOC_FAILED"; + case HIPBLAS_STATUS_INVALID_VALUE: return "HIPBLAS_STATUS_INVALID_VALUE"; + case HIPBLAS_STATUS_MAPPING_ERROR: return "HIPBLAS_STATUS_MAPPING_ERROR"; + case HIPBLAS_STATUS_EXECUTION_FAILED: return "HIPBLAS_STATUS_EXECUTION_FAILED"; + case HIPBLAS_STATUS_INTERNAL_ERROR: return "HIPBLAS_STATUS_INTERNAL_ERROR"; + case HIPBLAS_STATUS_NOT_SUPPORTED: return "HIPBLAS_STATUS_NOT_SUPPORTED"; + case HIPBLAS_STATUS_ARCH_MISMATCH: return "HIPBLAS_STATUS_ARCH_MISMATCH"; + case HIPBLAS_STATUS_HANDLE_IS_NULLPTR: return "HIPBLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_INVALID_ENUM: return "HIPBLAS_STATUS_INVALID_ENUM"; + case HIPBLAS_STATUS_UNKNOWN: return "HIPBLAS_STATUS_UNKNOWN"; + } + return ""; +} + +template +inline auto hipblaslt_invoke(F f, Ts... xs) +{ + // Call the function `f` with `xs...` and capture the status + auto status = f(xs...); + + if(status != HIPBLAS_STATUS_SUCCESS) + { + std::string error_message = + "hipBLAS error: '" + std::string(hipblas_status_to_string(status)) + "'(" + + std::to_string(status) + ") at " + __FILE__ + ":" + std::to_string(__LINE__); + MIGRAPHX_THROW(EXIT_FAILURE, error_message); + } + return status; +} + +// Invoke a hipBLASLt call. If used to validate a call, set fatal_error = false to prevent +// throwing an exception on failure. +template +auto hipblaslt_invoke(F f, Pack p, Ts... xs, bool fatal_error = true) +{ + return p([=](auto... ws) { + auto status = f(ws..., xs...); + if(status != HIPBLAS_STATUS_SUCCESS) + { + if(fatal_error) + { + MIGRAPHX_THROW("hipblaslt_invoke: hipBlasLt call failed with status " + + std::to_string(status)); + } + } + return status; + }); +} + +using hipblaslt_handle_ptr = MIGRAPHX_MANAGE_PTR(hipblasLtHandle_t, hipblasLtDestroy); +using hipblaslt_preference_ptr = MIGRAPHX_MANAGE_PTR(hipblasLtMatmulPreference_t, + hipblasLtMatmulPreferenceDestroy); + +hipblaslt_handle_ptr create_hipblaslt_handle_ptr(); +hipblaslt_preference_ptr create_hipblaslt_preference_ptr(); +bool hipblaslt_supported(); +const size_t hipblaslt_workspace_size = 2 * 128 * 1024 * 1024; +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_USE_HIPBLASLT +#endif // MIGRAPHX_GUARD_MIGRAPHLIB_HIPBLASLT_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/kernel.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/kernel.hpp new file mode 100644 index 000000000..63accdea4 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/kernel.hpp @@ -0,0 +1,80 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
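For illustration only, returning to hipblaslt_invoke above: the single-function overload converts any status other than HIPBLAS_STATUS_SUCCESS into a MIGraphX exception carrying the readable status string. A plausible call, assuming the standard hipBLASLt preference-attribute API:

    uint64_t max_ws = hipblaslt_workspace_size;
    auto pref       = create_hipblaslt_preference_ptr();
    hipblaslt_invoke(&hipblasLtMatmulPreferenceSetAttribute,
                     pref.get(),
                     HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
                     &max_ws,
                     sizeof(max_ws)); // throws on failure instead of returning an error code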
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_KERNEL_HPP +#define MIGRAPHX_GUARD_RTGLIB_KERNEL_HPP + +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct kernel_impl; + +struct MIGRAPHX_GPU_EXPORT kernel +{ + kernel() = default; + kernel(const char* image, const std::string& name); + template + kernel(const std::vector& image, const std::string& name) + : kernel(reinterpret_cast(image.data()), name) + { + } + + void launch(hipStream_t stream, + std::size_t global, + std::size_t local, + const std::vector& args, + hipEvent_t start = nullptr, + hipEvent_t stop = nullptr) const; + + void launch(hipStream_t stream, + std::size_t global, + std::size_t local, + std::vector args, + hipEvent_t start = nullptr, + hipEvent_t stop = nullptr) const; + + template + auto launch(hipStream_t stream, std::size_t global, std::size_t local, Ts... zs) const + { + return [=](auto&&... xs) { + launch(stream, global, local, std::vector{xs...}, zs...); + }; + } + + private: + std::shared_ptr impl; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/logsoftmax.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/logsoftmax.hpp new file mode 100644 index 000000000..5ea23ee27 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/logsoftmax.hpp @@ -0,0 +1,60 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_LOGSOFTMAX_HPP +#define MIGRAPHX_GUARD_RTGLIB_LOGSOFTMAX_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct hip_logsoftmax +{ + op::logsoftmax op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::logsoftmax"; } + shape compute_shape(const std::vector& inputs) const; + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const; + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/loop.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/loop.hpp new file mode 100644 index 000000000..792c84b74 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/loop.hpp @@ -0,0 +1,66 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
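A note on wrappers such as hip_logsoftmax above (an inference from the surrounding code, not stated in the patch): they all return shapes.size() - 1 from output_alias() because an allocation pass appends a preallocated destination buffer as the last argument.

    // Assumed convention, consistent across these headers: for N logical inputs the
    // instruction carries N + 1 arguments,
    //   args = {x0, ..., xN-1, output_buffer};
    // compute() writes into args.back() and returns it, and
    // output_alias(shapes) == shapes.size() - 1 reports that aliasing to the program.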
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_LOOP_HPP +#define MIGRAPHX_GUARD_RTGLIB_LOOP_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +struct hip_loop +{ + op::loop op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::loop"; } + shape compute_shape(std::vector inputs, std::vector mods) const; + argument + compute(context& ctx, + const shape& output_shape, + const std::vector& args, + const std::vector& mods, + const std::function( + module_ref&, const std::unordered_map&)>& run) const; + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/lowering.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/lowering.hpp new file mode 100644 index 000000000..6f4a3ca3e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/lowering.hpp @@ -0,0 +1,54 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_MIOPEN_LOWERING_HPP +#define MIGRAPHX_GUARD_RTGLIB_MIOPEN_LOWERING_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module_pass_manager; + +namespace gpu { + +/** + * Compiler pass that makes GPU-specific instruction changes. + * * Copies to and from the device if `offload_copy` is true. + * * Maps instructions to their GPU-specific counterparts. + * * Inserts `allocate` instructions before GPU operators. + */ +struct MIGRAPHX_GPU_EXPORT lowering +{ + context* ctx; + bool offload_copy; + std::string name() const { return "gpu::lowering"; } + void apply(module_pass_manager& mpm) const; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/lrn.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/lrn.hpp new file mode 100644 index 000000000..8ccda7bba --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/lrn.hpp @@ -0,0 +1,63 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
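A hedged sketch of how the gpu::lowering pass documented above is typically run; run_passes and dead_code_elimination are the generic MIGraphX pass utilities, and this particular pass list is an illustration, not the GPU target's actual get_passes() sequence.

    // Illustrative only -- the real order comes from the gpu target's pass list:
    gpu::context ctx;
    migraphx::run_passes(prog,
                         {gpu::lowering{&ctx, /*offload_copy=*/true},
                          migraphx::dead_code_elimination{}});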
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_LRN_HPP +#define MIGRAPHX_GUARD_RTGLIB_LRN_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; +#if MIGRAPHX_USE_MIOPEN +struct miopen_lrn +{ + op::lrn op; + shared ldesc; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::lrn"; } + shape compute_shape(const std::vector& inputs) const; + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const; + void finalize(context&, const shape&, const std::vector&); + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; +#endif +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/miopen.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/miopen.hpp new file mode 100644 index 000000000..87a561ad6 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/miopen.hpp @@ -0,0 +1,343 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
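The miopen_lrn wrapper above only declares its methods; a plausible reading (an assumption, since the corresponding .cpp is not in this hunk) is that finalize() builds the MIOpen descriptor once and caches it:

    // Hypothetical implementation sketch:
    //   void miopen_lrn::finalize(context&, const shape&, const std::vector<shape>&)
    //   {
    //       ldesc = make_lrn(op); // make_lrn() is defined later in miopen.hpp
    //   }
    // compute() can then reuse ldesc on every invocation instead of recreating it.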
+ */ +#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_MIOPEN_HPP +#define MIGRAPHX_GUARD_MIGRAPHLIB_MIOPEN_HPP + +#include +#include +#include +#if MIGRAPHX_USE_MIOPEN +#include +#include +#include +#include + +#include + +#ifdef MIGRAPHX_HAS_FIND_MODE_API +extern "C" miopenStatus_t +miopenHiddenSetConvolutionFindMode(miopenConvolutionDescriptor_t convDesc, // NOLINT + int findMode); // NOLINT +#endif + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +using miopen_handle = MIGRAPHX_MANAGE_PTR(miopenHandle_t, miopenDestroy); +using tensor_descriptor = MIGRAPHX_MANAGE_PTR(miopenTensorDescriptor_t, + miopenDestroyTensorDescriptor); +using convolution_descriptor = MIGRAPHX_MANAGE_PTR(miopenConvolutionDescriptor_t, + miopenDestroyConvolutionDescriptor); +using pooling_descriptor = MIGRAPHX_MANAGE_PTR(miopenPoolingDescriptor_t, + miopenDestroyPoolingDescriptor); +using activation_descriptor = MIGRAPHX_MANAGE_PTR(miopenActivationDescriptor_t, + miopenDestroyActivationDescriptor); +using fusion_plan_descriptor = MIGRAPHX_MANAGE_PTR(miopenFusionPlanDescriptor_t, + miopenDestroyFusionPlan); +using fused_operator_args = MIGRAPHX_MANAGE_PTR(miopenOperatorArgs_t, miopenDestroyOperatorArgs); + +using lrn_descriptor = MIGRAPHX_MANAGE_PTR(miopenLRNDescriptor_t, miopenDestroyLRNDescriptor); + +template +Result make_obj(F f, Ts... xs) +{ + typename Result::pointer x = nullptr; + auto status = f(&x, xs...); + Result r{x}; + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MAKE_OBJ: MIOpen call failed"); + return r; +} + +#ifdef MIGRAPHX_HAS_FIND_2_API +using miopen_find_options = MIGRAPHX_MANAGE_PTR(miopenFindOptions_t, miopenDestroyFindOptions); +using miopen_problem = MIGRAPHX_MANAGE_PTR(miopenProblem_t, miopenDestroyProblem); +using miopen_solution = MIGRAPHX_MANAGE_PTR(miopenSolution_t, miopenDestroySolution); + +inline miopen_solution find_solution(miopenHandle_t handle, + size_t num_inputs, + const miopenTensorArgument_t* tensor_args, + void* workspace, + size_t workspace_size, + miopenProblem_t problem, + bool tune = false) +{ + miopenSolution_t solution; + size_t found = 0; + miopen_find_options fo = make_obj(&miopenCreateFindOptions); + if(tune) + { + miopenSetFindOptionTuning(fo.get(), 1); + } +#ifdef MIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS + for(auto i : range(num_inputs)) + { + auto status = miopenSetFindOptionPreallocatedTensor( + fo.get(), tensor_args[i].id, tensor_args[i].buffer); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen: failed to preallocate tensors for the find process"); + } + auto status = miopenSetFindOptionPreallocatedWorkspace(fo.get(), workspace, workspace_size); + if(status != miopenStatusSuccess) + MIGRAPHX_THROW("MIOpen: failed to preallocate workspace for the find process"); +#else + miopenStatus_t status; + (void)(num_inputs); + (void)(tensor_args); + (void)(workspace_size); + (void)(workspace); +#endif + status = miopenFindSolutions(handle, problem, fo.get(), &solution, &found, 1); + auto result = miopen_solution{solution}; + if(status != miopenStatusSuccess or found == 0) + MIGRAPHX_THROW("MIOpen: miopenFindSolutions failed"); + return result; +} + +inline void set_tensor_descriptor(miopenTensorArgumentId_t name, + tensor_descriptor& desc, + miopen_problem& problem_ptr) +{ + auto status = miopenSetProblemTensorDescriptor(problem_ptr.get(), name, desc.get()); + if(status != miopenStatusSuccess) + { + MIGRAPHX_THROW("setting problem tensor description failed"); + } +} +#endif + +inline tensor_descriptor make_tensor(const migraphx::shape& os) +{ 
+ auto s = os.normalize_standard(); + auto t = make_obj(&miopenCreateTensorDescriptor); + // Convert to ints + std::vector lens(s.lens().begin(), s.lens().end()); + std::vector strides(s.strides().begin(), s.strides().end()); + miopenDataType_t d; + if(s.type() == shape::float_type) + d = miopenFloat; + else if(s.type() == shape::half_type) + d = miopenHalf; + else if(s.type() == shape::int32_type) + d = miopenInt32; + else if(s.type() == shape::int8_type) + d = miopenInt8; + else if(s.type() == shape::bf16_type) + d = miopenBFloat16; + else + MIGRAPHX_THROW("MAKE_TENSOR: unsupported type"); + miopenSetTensorDescriptor(t.get(), d, s.lens().size(), lens.data(), strides.data()); + + return t; +} + +template +inline convolution_descriptor make_conv(const T& op) +{ + auto c = make_obj(&miopenCreateConvolutionDescriptor); + miopenConvolutionMode_t c_mode = miopenConvolution; + if(op.group > 1) + c_mode = miopenGroupConv; + + int kdims = op.kdims(); + std::vector padding(std::max(2, kdims), 0); + std::vector stride(std::max(2, kdims), 1); + std::vector dilation(std::max(2, kdims), 1); + + std::copy_backward(op.padding.begin(), op.padding.begin() + kdims, padding.end()); + std::copy_backward(op.stride.begin(), op.stride.end(), stride.end()); + std::copy_backward(op.dilation.begin(), op.dilation.end(), dilation.end()); + + miopenInitConvolutionNdDescriptor( + c.get(), padding.size(), padding.data(), stride.data(), dilation.data(), c_mode); + if(op.group > 1) + miopenSetConvolutionGroupCount(c.get(), op.group); +#ifdef MIGRAPHX_HAS_FIND_MODE_API + miopenHiddenSetConvolutionFindMode(c.get(), 1); // Normal mode +#endif + return c; +} + +template +inline convolution_descriptor make_convolution_backwards(const T& op) +{ + auto c = make_obj(&miopenCreateConvolutionDescriptor); + miopenConvolutionMode_t c_mode = miopenTranspose; + int kdims = op.kdims(); + std::vector padding(std::max(2, kdims), 0); + std::vector stride(std::max(2, kdims), 1); + std::vector dilation(std::max(2, kdims), 1); + + std::copy_backward(op.padding.begin(), op.padding.end(), padding.end()); + std::copy_backward(op.stride.begin(), op.stride.end(), stride.end()); + std::copy_backward(op.dilation.begin(), op.dilation.end(), dilation.end()); + + miopenInitConvolutionNdDescriptor( + c.get(), padding.size(), padding.data(), stride.data(), dilation.data(), c_mode); + if(op.group > 1) + miopenSetConvolutionGroupCount(c.get(), op.group); + return c; +} + +inline pooling_descriptor make_pooling(const migraphx::op::pooling& op) +{ + miopenPoolingMode_t mode; + if(op.mode == op::pooling_mode::max) + mode = miopenPoolingMax; + else if(op.mode == op::pooling_mode::average) + mode = miopenPoolingAverage; + else + { + std::stringstream ss("Unknown mode for pooling: "); + ss << op.mode; + MIGRAPHX_THROW(ss.str()); + } + if(not std::all_of( + op.dilations.cbegin(), op.dilations.cend(), [](std::size_t d) { return d == 1; })) + { + MIGRAPHX_THROW("Unsupported dilations for pooling: [" + to_string_range(op.dilations) + + "]"); + } + auto p = make_obj(&miopenCreatePoolingDescriptor); + + int kdims = op.kdims(); + std::vector padding(std::max(2, kdims), 0); + std::vector stride(std::max(2, kdims), 1); + std::vector lengths(std::max(2, kdims), 1); + + std::copy_backward(op.padding.begin(), op.padding.begin() + kdims, padding.end()); + std::copy_backward(op.stride.begin(), op.stride.end(), stride.end()); + std::copy_backward(op.lengths.begin(), op.lengths.end(), lengths.end()); + + miopenSetNdPoolingDescriptor( + p.get(), mode, padding.size(), 
lengths.data(), padding.data(), stride.data()); + return p; +} + +inline lrn_descriptor make_lrn(const migraphx::op::lrn& op) +{ + auto ldesc = make_obj(&miopenCreateLRNDescriptor); + miopenSetLRNDescriptor(ldesc.get(), miopenLRNCrossChannel, op.size, op.alpha, op.beta, op.bias); + return ldesc; +} + +inline activation_descriptor make_relu() +{ + auto ad = make_obj(&miopenCreateActivationDescriptor); + miopenSetActivationDescriptor(ad.get(), miopenActivationRELU, 0, 0, 0); + return ad; +} + +inline activation_descriptor make_sigmoid() +{ + auto ad = make_obj(&miopenCreateActivationDescriptor); + miopenSetActivationDescriptor(ad.get(), miopenActivationLOGISTIC, 0, 0, 0); + return ad; +} + +inline activation_descriptor make_tanh() +{ + auto ad = make_obj(&miopenCreateActivationDescriptor); + // onnx operator does not apply additional scaling for tanh + // defaults for alpha and beta are therefore set to 1 + miopenSetActivationDescriptor(ad.get(), miopenActivationTANH, 1, 1, 0); + return ad; +} + +inline activation_descriptor make_abs() +{ + auto ad = make_obj(&miopenCreateActivationDescriptor); + miopenSetActivationDescriptor(ad.get(), miopenActivationABS, 0, 0, 0); + return ad; +} + +inline activation_descriptor make_leaky_relu(double alpha) +{ + auto ad = make_obj(&miopenCreateActivationDescriptor); + miopenSetActivationDescriptor(ad.get(), miopenActivationLEAKYRELU, alpha, 0, 0); + return ad; +} + +inline activation_descriptor make_elu(double alpha) +{ + auto ad = make_obj(&miopenCreateActivationDescriptor); + miopenSetActivationDescriptor(ad.get(), miopenActivationELU, alpha, 0, 0); + return ad; +} + +inline fusion_plan_descriptor make_fusion_plan(const shape& input) +{ + auto t = make_tensor(input); + return make_obj(&miopenCreateFusionPlan, miopenVerticalFusion, t.get()); +} + +// Temporary hack to workaround memory problems in miopen +inline fusion_plan_descriptor make_fusion_plan(const tensor_descriptor& input) +{ + return make_obj( + &miopenCreateFusionPlan, miopenVerticalFusion, input.get()); +} + +inline fused_operator_args make_fused_args() +{ + return make_obj(&miopenCreateOperatorArgs); +} + +template +auto reflect(miopenActivationDescriptor_t ad, F f) +{ + assert(ad != nullptr); + miopenActivationMode_t mode = miopenActivationPASTHRU; + double alpha = 0.0; + double beta = 0.0; + double gamma = 0.0; + miopenGetActivationDescriptor(ad, &mode, &alpha, &beta, &gamma); + return pack(f(std::move(mode), "mode"), // NOLINT + f(std::move(alpha), "alpha"), // NOLINT + f(std::move(beta), "beta"), // NOLINT + f(std::move(gamma), "gamma")); // NOLINT +} + +template +auto reflect(miopenLRNDescriptor_t lrnd, F f) +{ + assert(lrnd != nullptr); + miopenLRNMode_t mode = miopenLRNWithinChannel; + unsigned int n = 0; + double alpha = 0.0; + double beta = 0.0; + double k = 0.0; + miopenGetLRNDescriptor(lrnd, &mode, &n, &alpha, &beta, &k); + return pack(f(std::move(mode), "mode"), // NOLINT + f(std::move(n), "n"), // NOLINT + f(std::move(alpha), "alpha"), // NOLINT + f(std::move(beta), "beta"), // NOLINT + f(std::move(k), "k")); // NOLINT +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/mlir.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/mlir.hpp new file mode 100644 index 000000000..d1f19c1e8 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/mlir.hpp @@ -0,0 +1,80 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro 
Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_MLIR_HPP +#define MIGRAPHX_GUARD_RTGLIB_GPU_MLIR_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +struct module; +namespace gpu { + +MIGRAPHX_GPU_EXPORT std::string dump_mlir(module m); +MIGRAPHX_GPU_EXPORT std::string dump_mlir(module m, const std::vector& inputs); +MIGRAPHX_GPU_EXPORT void +dump_mlir_to_file(module m, const std::vector& inputs, const fs::path& location); + +MIGRAPHX_GPU_EXPORT bool +is_module_fusible(const module& m, const context& migraphx_ctx, const value& solution); + +struct MIGRAPHX_GPU_EXPORT mlir_code_object +{ + code_object_op cop; + std::vector prefill_indices = {}; + std::vector prefill_values = {}; +}; + +MIGRAPHX_GPU_EXPORT bool is_reduce(const instruction& ins); + +MIGRAPHX_GPU_EXPORT mlir_code_object compile_mlir(const context& migraphx_ctx, + module m, + const std::vector& in_shapes, + const value& solution); + +MIGRAPHX_GPU_EXPORT instruction_ref insert_mlir(module& m, + instruction_ref ins, + code_object_op co, + const std::vector& inputs); + +MIGRAPHX_GPU_EXPORT tuning_config get_tuning_config_mlir(const context& migraphx_ctx, + module m, + const std::vector& inputs, + bool exhaustive); + +MIGRAPHX_GPU_EXPORT void +dump_mlir_to_mxr(module m, const std::vector& inputs, const fs::path& location); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/multinomial.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/multinomial.hpp new file mode 100644 index 000000000..c44d48082 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/multinomial.hpp @@ -0,0 +1,59 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
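Taken together, the mlir.hpp declarations above suggest the offload flow below; this summary is inferred from the signatures rather than stated in the patch.

    // Inferred flow:
    //   1. get_tuning_config_mlir() enumerates candidate solutions for a fused module;
    //   2. compile_mlir() lowers the module for one solution and returns an
    //      mlir_code_object: the code_object_op plus any output buffers to prefill
    //      (prefill_indices / prefill_values);
    //   3. insert_mlir() splices the compiled code object back into the graph in
    //      place of the original instruction;
    //   dump_mlir() / dump_mlir_to_mxr() are debugging aids for the emitted IR.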
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_MULTINOMIAL_HPP +#define MIGRAPHX_GUARD_RTGLIB_MULTINOMIAL_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +struct hip_multinomial +{ + op::multinomial op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::multinomial"; } + shape compute_shape(std::vector inputs) const; + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const; + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/name.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/name.hpp new file mode 100644 index 000000000..390d7ea0b --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/name.hpp @@ -0,0 +1,67 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+#ifndef MIGRAPHX_GUARD_RTGLIB_OP_NAME_HPP
+#define MIGRAPHX_GUARD_RTGLIB_OP_NAME_HPP
+
+#include
+#include
+#include
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+template
+struct oper
+{
+    // Function to extract the name part of an operator. For example, if we have
+    // an operation "sin", then get_type_name() will return
+    // "migraphx::version_1::gpu::hip_sin", and this function will return the name
+    // "gpu::sin" as the operator name.
+    std::string name() const
+    {
+        const std::string& name = get_type_name();
+        // search the namespace gpu (::gpu::)
+        auto pos_ns = name.find("::gpu::");
+        if(pos_ns != std::string::npos)
+        {
+            auto pos_name = name.find("hip_", pos_ns + std::string("::gpu::").length());
+            if(pos_name != std::string::npos)
+            {
+                return std::string("gpu::") + name.substr(pos_name + 4);
+            }
+            else
+            {
+                return name.substr(pos_ns + 2);
+            }
+        }
+        return "unknown_operator_name";
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/nonzero.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/nonzero.hpp
new file mode 100644
index 000000000..cfc7e78db
--- /dev/null
+++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/nonzero.hpp
@@ -0,0 +1,62 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
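A couple of concrete inputs make the string handling in oper::name() above easier to follow; the second type name is hypothetical.

    // get_type_name()                         -> name()
    // "migraphx::version_1::gpu::hip_sin"     -> "gpu::sin"        (strips "hip_")
    // "migraphx::version_1::gpu::miopen_lrn"  -> "gpu::miopen_lrn" (no "hip_", kept as-is)
    // a type with no "::gpu::" in its name    -> "unknown_operator_name"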
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_NONZERO_HPP +#define MIGRAPHX_GUARD_RTGLIB_NONZERO_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +struct hip_nonzero +{ + op::nonzero op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::nonzero"; } + shape compute_shape(std::vector inputs) const; + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const; + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/oper.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/oper.hpp new file mode 100644 index 000000000..13ac11a3d --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/oper.hpp @@ -0,0 +1,168 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_UNARY_HPP +#define MIGRAPHX_GUARD_RTGLIB_UNARY_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +template +struct device_base : oper +{ + template + static auto reflect(Self&, F) + { + return pack(); + } + + std::vector reduce_shapes; + + void finalize(context&, const shape&, const std::vector& inputs) + { + reduce_shapes = reduce_dims(inputs); + } + + argument get_arg(const std::vector& args, std::size_t i) const + { + if(reduce_shapes.empty()) + return args[i]; + return args.at(i).reshape(reduce_shapes.at(i)); + } + + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(N + 1); + auto s0 = inputs.at(0); + if(std::all_of(inputs.begin(), inputs.end() - 1, [&](auto s) { return s == s0; }) and + s0.packed()) + return s0; + else + return {s0.type(), s0.lens()}; + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +template +struct unary_device : device_base +{ + argument compute(context& ctx, const shape&, const std::vector& args) const + { + F(ctx.get_stream().get(), this->get_arg(args, 1), this->get_arg(args, 0)); + return args[1]; + } +}; + +template +struct binary_device : device_base +{ + argument compute(context& ctx, const shape&, const std::vector& args) const + { + F(ctx.get_stream().get(), + this->get_arg(args, 2), + this->get_arg(args, 0), + this->get_arg(args, 1)); + return args[2]; + } +}; + +template +struct ternary_device : device_base +{ + argument compute(context& ctx, const shape&, const std::vector& args) const + { + F(ctx.get_stream().get(), + this->get_arg(args, 3), + this->get_arg(args, 0), + this->get_arg(args, 1), + this->get_arg(args, 2)); + return args[3]; + } +}; + +template +struct quaternary_device : device_base +{ + argument compute(context& ctx, const shape&, const std::vector& args) const + { + F(ctx.get_stream().get(), + this->get_arg(args, 4), + this->get_arg(args, 0), + this->get_arg(args, 1), + this->get_arg(args, 2), + this->get_arg(args, 3)); + return args[4]; + } +}; + +template +struct quinary_device : device_base +{ + argument compute(context& ctx, const shape&, const std::vector& args) const + { + F(ctx.get_stream().get(), + this->get_arg(args, 5), + this->get_arg(args, 0), + this->get_arg(args, 1), + this->get_arg(args, 2), + this->get_arg(args, 3), + this->get_arg(args, 4)); + return args[5]; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/pack_args.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/pack_args.hpp new file mode 100644 index 000000000..1896a3008 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/pack_args.hpp @@ -0,0 +1,55 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
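The CRTP helpers in oper.hpp above reduce a pointwise GPU operator to a one-line definition. A hedged sketch follows; the device::sin kernel name and its exact signature are assumptions, not part of this hunk.

    // Assuming a device kernel  void sin(hipStream_t, const argument& result, const argument& x):
    //   struct hip_sin : unary_device<hip_sin, device::sin> {};
    // device_base then supplies the shape check, the reduce_dims() flattening done in
    // finalize(), and the last-argument output aliasing; oper<hip_sin> reports "gpu::sin".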
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_PACK_ARGS_HPP +#define MIGRAPHX_GUARD_RTGLIB_PACK_ARGS_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct kernel_argument +{ + template , + MIGRAPHX_REQUIRES(not std::is_base_of{})> + kernel_argument(T&& x) : size(sizeof(U)), align(alignof(U)), data(&x) // NOLINT + { + } + std::size_t size; + std::size_t align; + void* data; +}; + +MIGRAPHX_GPU_EXPORT std::vector pack_args(const std::vector& args); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/perfdb.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/perfdb.hpp new file mode 100644 index 000000000..21aed313c --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/perfdb.hpp @@ -0,0 +1,49 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
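pack_args() above flattens heterogeneous kernel arguments into one contiguous, naturally aligned byte buffer for a HIP kernel launch. A minimal hypothetical use (the values are invented for illustration):

    float alpha      = 2.0f;
    void* device_ptr = nullptr; // in practice an argument's data pointer
    std::vector<char> blob = pack_args({alpha, device_ptr});
    // blob can now be handed to hipModuleLaunchKernel via HIP_LAUNCH_PARAM_BUFFER_POINTER.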
+ */ +#ifndef MIGRAPHX_GUARD_GPU_PERFDB_HPP +#define MIGRAPHX_GUARD_GPU_PERFDB_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct problem_params +{ + operation op; + std::vector inputs; + shape output; +}; + +std::string get_mlir_perf_for_conv(const problem_params& pp, bool xdlops); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_PERFDB_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/pooling.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/pooling.hpp new file mode 100644 index 000000000..7f6722b11 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/pooling.hpp @@ -0,0 +1,64 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_POOLING_HPP +#define MIGRAPHX_GUARD_RTGLIB_POOLING_HPP + +#include +#include +#include +#include +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; +#if MIGRAPHX_USE_MIOPEN +struct miopen_pooling +{ + op::pooling op; + shared pd; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::pooling"; } + shape compute_shape(const std::vector& inputs) const; + void finalize(context&, const shape&, const std::vector&); + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const; + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; +#endif + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/prefix_scan_sum.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/prefix_scan_sum.hpp new file mode 100644 index 000000000..cca8efd60 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/prefix_scan_sum.hpp @@ -0,0 +1,79 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_GPU_PREFIX_SCAN_SUM_HPP +#define MIGRAPHX_GUARD_GPU_PREFIX_SCAN_SUM_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +struct hip_prefix_scan_sum : oper +{ + op::prefix_scan_sum op; + + template + static auto reflect(Self& self, T f) + { + return migraphx::reflect(self.op, f); + } + + shape compute_shape(const std::vector& inputs) const + { + std::vector in_shapes{inputs}; + in_shapes.pop_back(); + check_shapes{in_shapes, *this}.standard(); + return op.normalize_compute_shape(in_shapes); + } + + argument compute(context& ctx, const shape&, const std::vector& args) const + { + device::prefix_scan_sum( + ctx.get_stream().get(), args[1], args[0], op.axis, op.exclusive, op.reverse); + return args[1]; + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_PREFIX_SCAN_SUM_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp new file mode 100644 index 000000000..bed640520 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp @@ -0,0 +1,48 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP +#define MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module_pass_manager; + +namespace gpu { + +struct MIGRAPHX_GPU_EXPORT prefuse_ops +{ + bool enable_attention = false; + std::string name() const { return "gpu::prefuse_ops"; } + void apply(module_pass_manager& mpm) const; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif // MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/prepare_reduce.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/prepare_reduce.hpp new file mode 100644 index 000000000..3c6bfdd42 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/prepare_reduce.hpp @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +#ifndef MIGRAPHX_GUARD_GPU_PREPARE_REDUCE_HPP +#define MIGRAPHX_GUARD_GPU_PREPARE_REDUCE_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module; + +namespace gpu { + +struct prepare_reduce +{ + std::string name() const { return "gpu::prepare_reduce"; } + void apply(module& m) const; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_PREPARE_REDUCE_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/problem_cache.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/problem_cache.hpp new file mode 100644 index 000000000..d70e0687b --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/problem_cache.hpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +#ifndef MIGRAPHX_GUARD_GPU_PROBLEM_CACHE_HPP +#define MIGRAPHX_GUARD_GPU_PROBLEM_CACHE_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +struct MIGRAPHX_GPU_EXPORT problem_cache +{ + bool has(const std::string& name, const value& problem) const; + void insert(const std::string& name, const value& problem, const value& solution); + void mark(const std::string& name, const value& problem); + optional get(const std::string& name, const value& problem) const; + void load(); + void save() const; + std::unordered_map cache; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_PROBLEM_CACHE_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/reduce_op.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/reduce_op.hpp new file mode 100644 index 000000000..10f3dcf84 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/reduce_op.hpp @@ -0,0 +1,81 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
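A hedged usage sketch for problem_cache above; the pass name "mlir_tuning", the problem key, and the tune()/use_solution() helpers are placeholders, not taken from the patch.

    problem_cache pc;
    pc.load();                                   // read previously saved entries
    value problem = {{"op", "dot"}, {"m", 64}, {"n", 64}, {"k", 64}}; // hypothetical key
    if(auto sol = pc.get("mlir_tuning", problem))
        use_solution(*sol);                      // hypothetical consumer
    else
    {
        pc.mark("mlir_tuning", problem);         // record that tuning is pending
        pc.insert("mlir_tuning", problem, tune(problem)); // hypothetical tuner
    }
    pc.save();                                   // persist for the next compile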
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_REDUCE_OP_HPP +#define MIGRAPHX_GUARD_RTGLIB_REDUCE_OP_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +template +struct reduce_op : oper +{ + Op op; + + template + static auto reflect(Self& self, T f) + { + return migraphx::reflect(self.op, f); + } + + shape compute_shape(const std::vector& inputs) const + { + std::vector in_shapes{inputs}; + in_shapes.pop_back(); + check_shapes{in_shapes, *this}.standard(); + return op.normalize_compute_shape(in_shapes); + } + + argument compute(context& ctx, const shape&, const std::vector& args) const + { + F(ctx.get_stream().get(), args[1], args[0]); + return args[1]; + } + + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } + + reduce_op() {} + reduce_op(const Op& op_ref) : op(op_ref) {} +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/reverse.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/reverse.hpp new file mode 100644 index 000000000..8ef825235 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/reverse.hpp @@ -0,0 +1,62 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_REVERSE_HPP +#define MIGRAPHX_GUARD_RTGLIB_REVERSE_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +struct hip_reverse +{ + op::reverse op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::reverse"; } + shape compute_shape(std::vector inputs) const; + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const; + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/rnn_variable_seq_lens.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/rnn_variable_seq_lens.hpp new file mode 100644 index 000000000..7d811192d --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/rnn_variable_seq_lens.hpp @@ -0,0 +1,101 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_RNN_VARIABLE_SEQ_LENS_HPP +#define MIGRAPHX_GUARD_RTGLIB_RNN_VARIABLE_SEQ_LENS_HPP + +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct hip_rnn_var_sl_shift_sequence +{ + op::rnn_var_sl_shift_sequence op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::rnn_var_sl_shift_sequence"; } + shape compute_shape(std::vector inputs) const; + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const; + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +struct hip_rnn_var_sl_shift_output +{ + op::rnn_var_sl_shift_output op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::rnn_var_sl_shift_output"; } + shape compute_shape(std::vector inputs) const; + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const; + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +struct hip_rnn_var_sl_last_output +{ + op::rnn_var_sl_last_output op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::" + op.name(); } + shape compute_shape(std::vector inputs) const; + argument compute(context& ctx, const shape&, const std::vector& args) const; + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/rocblas.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/rocblas.hpp new file mode 100644 index 000000000..d23c40f9d --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/rocblas.hpp @@ -0,0 +1,52 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_ROCBLAS_HPP +#define MIGRAPHX_GUARD_MIGRAPHLIB_ROCBLAS_HPP +#include +#include +#if MIGRAPHX_USE_ROCBLAS +#include +#endif + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +#if MIGRAPHX_USE_ROCBLAS + +using rocblas_handle_ptr = MIGRAPHX_MANAGE_PTR(rocblas_handle, rocblas_destroy_handle); + +rocblas_handle_ptr create_rocblas_handle_ptr(); +rocblas_handle_ptr create_rocblas_handle_ptr(hipStream_t s); +#endif +struct context; + +MIGRAPHX_GPU_EXPORT bool get_compute_fp32_flag(); + +MIGRAPHX_GPU_EXPORT bool rocblas_fp8_available(); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/schedule_model.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/schedule_model.hpp new file mode 100644 index 000000000..d9c692cb7 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/schedule_model.hpp @@ -0,0 +1,53 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_SCHEDULE_MODEL_HPP +#define MIGRAPHX_GUARD_RTGLIB_GPU_SCHEDULE_MODEL_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module; +struct operation; + +namespace gpu { + +struct schedule_model +{ + std::size_t streams = 0; + std::size_t concurrency() const; + void sched(module& m, instruction_ref ins, std::size_t n) const; + void wait(module& m, instruction_ref ins, std::size_t wait_id) const; + void record(module& m, instruction_ref ins, std::size_t wait_id) const; + std::size_t weight(const operation& op) const; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/sync_device.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/sync_device.hpp new file mode 100644 index 000000000..331152cbf --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/sync_device.hpp @@ -0,0 +1,45 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_SYNC_DEVICE_HPP +#define MIGRAPHX_GUARD_RTGLIB_GPU_SYNC_DEVICE_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +struct module; + +namespace gpu { + +struct sync_device +{ + std::string name() const { return "sync_device"; } + void apply(module& m) const; +}; +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/target.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/target.hpp new file mode 100644 index 000000000..407c44fec --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/target.hpp @@ -0,0 +1,49 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_MIOPEN_TARGET_HPP +#define MIGRAPHX_GUARD_MIGRAPHLIB_MIOPEN_TARGET_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct MIGRAPHX_GPU_EXPORT target +{ + std::string name() const; + std::vector get_passes(migraphx::context& gctx, const compile_options& options) const; + migraphx::context get_context() const; + argument copy_to(const argument& arg) const; + argument copy_from(const argument& arg) const; + argument allocate(const shape& s) const; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/time_op.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/time_op.hpp new file mode 100644 index 000000000..2c5893eed --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/time_op.hpp @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_GPU_DRIVER_PERF_HPP +#define MIGRAPHX_GUARD_GPU_DRIVER_PERF_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_GPU_EXPORT double +time_op(const context& ictx, operation op, const std::vector& inputs, int n = 100); + +MIGRAPHX_GPU_EXPORT double time_program(const context& ictx, program p, int n = 100); + +/* benchmark gpu::code_object with expected input shapes over n iterations */ +MIGRAPHX_GPU_EXPORT double time_op(const context& ictx, operation op, int n = 100); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_DRIVER_PERF_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/topk.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/topk.hpp new file mode 100644 index 000000000..f1df9d469 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/topk.hpp @@ -0,0 +1,62 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_TOPK_HPP +#define MIGRAPHX_GUARD_RTGLIB_TOPK_HPP + +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct context; + +struct hip_topk +{ + op::topk op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "gpu::topk"; } + shape compute_shape(std::vector inputs) const; + argument + compute(context& ctx, const shape& output_shape, const std::vector& args) const; + std::ptrdiff_t output_alias(const std::vector& shapes) const + { + return shapes.size() - 1; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/tuning_config.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/tuning_config.hpp new file mode 100644 index 000000000..23538c0d3 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/tuning_config.hpp @@ -0,0 +1,43 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_GPU_TUNING_CONFIG_HPP +#define MIGRAPHX_GUARD_GPU_TUNING_CONFIG_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct tuning_config +{ + value problem; + std::vector solutions; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_TUNING_CONFIG_HPP diff --git a/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/write_literals.hpp b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/write_literals.hpp new file mode 100644 index 000000000..85a2ce3a8 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/include/migraphx/gpu/write_literals.hpp @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_MIOPEN_WRITE_LITERALS_HPP +#define MIGRAPHX_GUARD_RTGLIB_MIOPEN_WRITE_LITERALS_HPP + +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +struct module; + +namespace gpu { + +struct MIGRAPHX_GPU_EXPORT write_literals +{ + context* ctx = nullptr; + std::string name() const { return "gpu::write_literals"; } + + void apply(module& m) const; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/jit/ck_gemm.cpp b/docker/rocm/migraphx/targets/gpu/jit/ck_gemm.cpp new file mode 100644 index 000000000..392eaa0c6 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/ck_gemm.cpp @@ -0,0 +1,235 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +// NOLINTNEXTLINE +static const char* const ck_gemm_kernel = R"__migraphx__( +#include +#include +#include +#include +#include <${include}> + +namespace migraphx { + +${preamble} + +extern "C" { + +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + transform_args(make_tensors(), rotate_last())(${args})([](auto... xs) { + ck_gemm<${solution}, ${blocks_per_batch}>(xs...); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct ck_gemm_compiler : compiler +{ + std::vector names() const { return {"ck_gemm", "gpu::ck_gemm"}; } + + ck::host::device_gemm_multiple_d::Problem create_problem(const std::vector& inputs, + const value& v) const + { + const auto& a_shape = inputs[0]; + const auto& b_shape = inputs[1]; + const auto& c_shape = inputs.back(); + + // cppcheck-suppress unreadVariable + auto rank = a_shape.ndim(); + auto batch_count = get_batch_count(c_shape); + auto m = c_shape.lens()[rank - 2]; + m = can_fold_batch(inputs) ? m * batch_count : m; + auto n = c_shape.lens().back(); + auto k = a_shape.lens().back(); + + const bool trans_a = transposed_matrix(a_shape); + const bool trans_b = transposed_matrix(b_shape); + const bool trans_e = transposed_matrix(c_shape); + const auto a_type = get_type(a_shape); + const auto b_type = get_type(b_shape); + const auto e_type = get_type(c_shape); + std::vector ds_layout; + std::transform(inputs.begin() + 2, + inputs.end() - 1, + std::back_inserter(ds_layout), + [](const auto& i) { return transposed_matrix(i); }); + std::vector ds_type; + std::transform(inputs.begin() + 2, + inputs.end() - 1, + std::back_inserter(ds_type), + [](const auto& i) { return get_type(i); }); + + std::string ck_passthrough = "ck_passthrough"; + std::string cde_op = ck_passthrough; + assert(inputs.size() < 4 or v.contains("post")); + if(v.contains("post")) + { + cde_op = v.at("post").to(); + } + + return ck::host::device_gemm_multiple_d::Problem{m, + n, + k, + trans_a, + trans_b, + trans_e, + ds_layout, + a_type, + b_type, + e_type, + ds_type, + ck_passthrough, + ck_passthrough, + cde_op}; + } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + const auto& c_shape = inputs.back(); + auto tuning_value = v.get("tuning_value", 34); + auto batch_count = get_batch_count(c_shape); + auto problem = create_problem(inputs, v); + + const auto include_header = problem.GetIncludeHeader(); + const auto solutions = problem.GetSolutions(ctx.get_current_device().get_gfx_name()); + const auto& solution = solutions.at(tuning_value); + const auto template_str = solution.template_str; + const auto blocks_per_batch = solution.grid_size; + const auto block_size = solution.block_size; + + hip_compile_options options; + options.additional_src_files = ck_headers(); + auto grid_size = 
can_fold_batch(inputs) ? blocks_per_batch : batch_count * blocks_per_batch; + options.set_launch_params(v, grid_size * block_size, block_size); + options.inputs = inputs; + options.output = c_shape; + options.kernel_name = v.get("kernel", "ck_gemm_kernel"); + options.virtual_inputs = inputs; + if(can_fold_batch(inputs)) + { + auto vinputs = inputs; + fold_batch_dims(vinputs[0]); + remove_batch_dims(vinputs[1]); + std::for_each(vinputs.begin() + 2, vinputs.end(), fold_batch_dims); + options.virtual_inputs = vinputs; + } + + if(v.get("check", false) or enabled(MIGRAPHX_CK_DEBUG{})) + options.emplace_param("-DMIGRAPHX_CK_CHECK=1"); + + auto src = interpolate_string(ck_gemm_kernel, + {{"solution", template_str}, + {"include", include_header}, + {"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"blocks_per_batch", to_string(blocks_per_batch)}, + {"preamble", v.get("preamble", std::string{})}, + {"kernel", options.kernel_name}}); + + return compile_hip_code_object(ctx, src, options); + } + + value create_settings(instruction_ref ins, const operation& op) const + { + auto v = op.to_value(); + v["kernel"] = "ck_gemm_kernel"; + if(not ins->module_inputs().empty()) + { + auto* pm = ins->module_inputs().front(); + v["preamble"] = generate_pointwise(*pm, "post_ck_gemm_function") + + "\nMIGRAPHX_LIFT_CLASS(post_ck_gemm, post_ck_gemm_function);"; + v["post"] = "ck_function_adaptor"; + v["kernel"] = to_c_id("ck_gemm_" + generate_name_from_ops(*pm) + "_kernel"); + } + return v; + } + + compiler_replace + compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) const + { + auto shapes = to_shapes(ins->inputs()); + auto v = create_settings(ins, op); + if(not solution.is_null()) + v["tuning_value"] = solution; + return {compile_op(ctx, shapes, v), + [=](module& m, instruction_ref ins2, const operation& code_object) { + if(enabled(MIGRAPHX_LOG_CK_GEMM{})) + { + std::vector gemm_shapes{ + shapes[0], shapes[1], shapes.back().with_type(shapes[0].type())}; + std::cout << "gpu::ck_gemm: " << to_json_string(to_value(gemm_shapes)) + << std::endl; + } + m.replace_instruction(ins2, code_object, ins2->inputs()); + }}; + } + + optional + get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive) const + { + if(not exhaustive and not enabled(MIGRAPHX_TUNE_CK{})) + return nullopt; + tuning_config tc; + auto shapes = to_shapes(ins->inputs()); + auto problem = create_problem(shapes, create_settings(ins, op)); + auto solutions = problem.GetSolutions(ctx.get_current_device().get_gfx_name()); + tc.solutions.resize(solutions.size()); + std::iota(tc.solutions.begin(), tc.solutions.end(), 0); + std::vector gemm_shapes{shapes[0], shapes[1], shapes.back()}; + tc.problem = to_value(gemm_shapes); + return tc; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/ck_gemm_softmax_gemm.cpp b/docker/rocm/migraphx/targets/gpu/jit/ck_gemm_softmax_gemm.cpp new file mode 100644 index 000000000..693153d09 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/ck_gemm_softmax_gemm.cpp @@ -0,0 +1,236 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +// NOLINTNEXTLINE +static const char* const ck_gemm_softmax_gemm_kernel = R"__migraphx__( +#include +#include +#include +#include +#include +#include +#include <${include}> + +namespace migraphx { + +${preamble} + +extern "C" { + +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + transform_args(make_tensors(), rotate_last())(${args})([](auto... xs) { + auto settings = make_ck_gemm_softmax_gemm_settings(MIGRAPHX_MAKE_CONSTANT(float{SCALE})); + ck_gemm_softmax_gemm<${solution}, ${blocks_per_batch}>(settings, xs...); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct ck_gemm_softmax_gemm_compiler : compiler +{ + std::vector names() const + { + return {"ck_gemm_softmax_gemm", "gpu::ck_gemm_softmax_gemm"}; + } + + ck::host::device_batched_gemm_softmax_gemm::Problem + create_problem(const std::vector& inputs, const value&) const + { + const auto& a_shape = inputs[0]; + const auto& b_shape = inputs[1]; + const auto& b1_shape = inputs[2]; + const auto& c_shape = inputs.back(); + + // cppcheck-suppress unreadVariable + auto rank = a_shape.ndim(); + auto batch_count = get_batch_count(c_shape); + auto m = c_shape.lens()[rank - 2]; + m = can_fold_batch(inputs) ? 
m * batch_count : m; + auto n = c_shape.lens().back(); + auto k = a_shape.lens().back(); + auto o = c_shape.lens().back(); + + const bool trans_a = transposed_matrix(a_shape); + const bool trans_b = transposed_matrix(b_shape); + const bool trans_b1 = transposed_matrix(b1_shape); + const bool trans_c = transposed_matrix(c_shape); + const auto a_type = get_type(a_shape); + const auto b_type = get_type(b_shape); + const auto b1_type = get_type(b1_shape); + const auto c_type = get_type(c_shape); + + std::string ck_passthrough = "ck_passthrough"; + return ck::host::device_batched_gemm_softmax_gemm::Problem{m, + n, + k, + o, + trans_a, + trans_b, + trans_b1, + trans_c, + a_type, + b_type, + b1_type, + c_type, + ck_passthrough, + ck_passthrough, + ck_passthrough, + ck_passthrough}; + } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + const auto& c_shape = inputs.back(); + auto tuning_value = v.get("tuning_value", 5); + auto batch_count = get_batch_count(c_shape); + auto problem = create_problem(inputs, v); + + const auto include_header = problem.GetIncludeHeader(); + const auto solutions = problem.GetSolutions(ctx.get_current_device().get_gfx_name()); + const auto& solution = solutions.at(tuning_value); + const auto template_str = solution.template_str; + const auto blocks_per_batch = solution.grid_size; + const auto block_size = solution.block_size; + + hip_compile_options options; + options.additional_src_files = ck_headers(); + auto grid_size = can_fold_batch(inputs) ? blocks_per_batch : batch_count * blocks_per_batch; + options.set_launch_params(v, grid_size * block_size, block_size); + options.inputs = inputs; + options.output = c_shape; + options.kernel_name = v.get("kernel", "ck_gemm_softmax_gemm_kernel"); + options.virtual_inputs = inputs; + if(can_fold_batch(inputs)) + { + auto vinputs = inputs; + fold_batch_dims(vinputs[0]); + remove_batch_dims(vinputs[1]); + std::for_each(vinputs.begin() + 2, vinputs.end(), fold_batch_dims); + options.virtual_inputs = vinputs; + } + + if(v.get("check", false) or enabled(MIGRAPHX_CK_DEBUG{})) + options.emplace_param("-DMIGRAPHX_CK_CHECK=1"); + + // scale + assert(v.contains("scale")); + auto scale = v.at("scale").to(); + options.emplace_param("-DSCALE=" + std::to_string(scale)); + + auto src = interpolate_string(ck_gemm_softmax_gemm_kernel, + {{"solution", template_str}, + {"include", include_header}, + {"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"blocks_per_batch", to_string(blocks_per_batch)}, + {"preamble", v.get("preamble", std::string{})}, + {"kernel", options.kernel_name}}); + + return compile_hip_code_object(ctx, src, options); + } + + value create_settings(instruction_ref ins, const operation& op) const + { + auto v = op.to_value(); + v["kernel"] = "ck_gemm_softmax_gemm_kernel"; + if(not ins->module_inputs().empty()) + { + auto* pm = ins->module_inputs().front(); + v["preamble"] = generate_pointwise(*pm, "post_ck_gemm_softmax_gemm_function") + + "\nMIGRAPHX_LIFT_CLASS(post_ck_gemm_softmax_gemm, " + "post_ck_gemm_softmax_gemm_function);"; + v["post"] = "ck_function_adaptor"; + v["kernel"] = "ck_gemm_softmax_gemm_" + generate_name_from_ops(*pm) + "_kernel"; + } + return v; + } + + compiler_replace + compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) const + { + auto shapes = to_shapes(ins->inputs()); + auto v = create_settings(ins, op); + if(not solution.is_null()) + v["tuning_value"] = solution; 
+ return {compile_op(ctx, shapes, v), + [=](module& m, instruction_ref ins2, const operation& code_object) { + if(enabled(MIGRAPHX_LOG_CK_GEMM{})) + { + std::vector gemm_shapes{ + shapes[0], shapes[1], shapes.back().with_type(shapes[0].type())}; + std::cout << "gpu::ck_gemm_softmax_gemm: " + << to_json_string(to_value(gemm_shapes)) << std::endl; + } + m.replace_instruction(ins2, code_object, ins2->inputs()); + }}; + } + + optional + get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive) const + { + if(not exhaustive and not enabled(MIGRAPHX_TUNE_CK{})) + return nullopt; + tuning_config tc; + auto shapes = to_shapes(ins->inputs()); + auto problem = create_problem(shapes, create_settings(ins, op)); + auto solutions = problem.GetSolutions(ctx.get_current_device().get_gfx_name()); + tc.solutions.resize(solutions.size()); + std::iota(tc.solutions.begin(), tc.solutions.end(), 0); + std::vector gemm_shapes{shapes[0], shapes[1], shapes.back()}; + tc.problem = to_value(gemm_shapes); + return tc; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/compute_attention_probabilities.cpp b/docker/rocm/migraphx/targets/gpu/jit/compute_attention_probabilities.cpp new file mode 100644 index 000000000..8a0c72207 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/compute_attention_probabilities.cpp @@ -0,0 +1,114 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +// NOLINTNEXTLINE +static const char* const compute_attention_probabilities_kernel = R"__migraphx__( +#include +#include +#include +#include +#include + +namespace migraphx { + +extern "C" { + +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + transform_args(make_tensors(), rotate_last())(${args})([](auto... 
xs) { + + compute_attention_probabilities(xs..., make_gqa_parameters(${gqa_params})); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct compute_attention_probabilities_compiler : compiler +{ + std::vector names() const + { + return {"compute_attention_probabilities", "gpu::compute_attention_probabilities"}; + } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + auto params = init_params(inputs, v); + auto gqa_params_str = params.make_init_str(); + + hip_compile_options options; + options.set_launch_params( + v, + compute_global_for(ctx, + params.batch_size * params.num_heads * params.sequence_length * + params.seqlen_present_kv_cache)); + options.inputs = inputs; + options.output = inputs.back(); + options.kernel_name = v.get("kernel", "compute_attention_probabilities_kernel"); + + auto src = interpolate_string(compute_attention_probabilities_kernel, + {{"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"gqa_params", gqa_params_str}, + {"kernel", options.kernel_name}}); + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + auto shapes = to_shapes(ins->inputs()); + auto v = op.to_value(); + return compile_op(ctx, shapes, v); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/compute_attention_scores.cpp b/docker/rocm/migraphx/targets/gpu/jit/compute_attention_scores.cpp new file mode 100644 index 000000000..a8834a24e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/compute_attention_scores.cpp @@ -0,0 +1,119 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +// NOLINTNEXTLINE +static const char* const compute_attention_scores_kernel = R"__migraphx__( +#include +#include +#include +#include +#include + + +namespace migraphx { + + + +extern "C" { + + +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + transform_args(make_tensors(), rotate_last())(${args})([](auto... xs) { + + compute_attention_scores(xs..., make_gqa_parameters(${gqa_params})); + }); +} + + +} + +} // namespace migraphx + +)__migraphx__"; + +struct compute_attention_scores_compiler : compiler +{ + std::vector names() const + { + return {"compute_attention_scores", "gpu::compute_attention_scores"}; + } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + auto params = init_params(inputs, v); + auto gqa_params_str = params.make_init_str(); + + hip_compile_options options; + options.set_launch_params( + v, + compute_global_for(ctx, + params.batch_size * params.num_heads * params.sequence_length * + params.head_size)); + options.inputs = inputs; + options.output = inputs.back(); + options.kernel_name = v.get("kernel", "compute_attention_scores_kernel"); + + auto src = interpolate_string(compute_attention_scores_kernel, + {{"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"gqa_params", gqa_params_str}, + {"kernel", options.kernel_name}}); + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + auto shapes = to_shapes(ins->inputs()); + auto v = op.to_value(); + return compile_op(ctx, shapes, v); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/concat.cpp b/docker/rocm/migraphx/targets/gpu/jit/concat.cpp new file mode 100644 index 000000000..322f863e3 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/concat.cpp @@ -0,0 +1,202 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +// NOLINTNEXTLINE +static const char* const concat_kernel = R"__migraphx__( +#include +#include +#include +#include + +namespace migraphx { + +${preamble} + +extern "C" { + +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto y, ${concat_params}, auto... xs) { + concat<${axis}>(${concat_args})(${post}, y, xs...); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct concat_compiler : compiler +{ + std::vector names() const { return {"fused_concat", "concat"}; } + + static std::vector normalize(std::vector inputs, std::size_t& axis) + { + auto s = inputs.back(); + std::vector strides(s.lens().size()); + strides[axis] = 1; + + inputs.push_back(shape{s.type(), s.lens(), strides}); + + auto result = reduce_dims(normalize_permutation(inputs)); + auto rstrides = result.back().strides(); + auto it = std::find_if(rstrides.begin(), rstrides.end(), [](auto x) { return x == 1; }); + axis = it - rstrides.begin(); + result.pop_back(); + return result; + } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + hip_compile_options options; + options.inputs = inputs; + options.output = inputs.back(); + auto concat_axis = v.at("axis").to(); + options.virtual_inputs = normalize(inputs, concat_axis); + options.kernel_name = v.get("kernel", "concat_kernel"); + auto axis = find_fast_axis(options.virtual_inputs); + auto op_names = v.at("ops").to_vector(); + auto args = v.at("args"); + vectorize vec{}; + if(axis != concat_axis) + vec = vectorize::elements(ctx, axis, options.virtual_inputs); + auto nelements_per_op = options.virtual_inputs.back().elements() / op_names.size(); + options.set_launch_params(v, compute_global_for(ctx, nelements_per_op / vec.size, 256)); + options.emplace_param("-Wno-float-equal"); + std::vector concat_params; + std::vector concat_args; + for(auto i : range(op_names.size())) + { + const auto& name = op_names[i]; + auto n = args.at(name).to(); + auto prefix = to_c_id(name + std::to_string(i) + "_concat_x"); + transform(range(n), std::back_inserter(concat_params), [&](auto j) { + return "auto " + prefix + std::to_string(j); + }); + std::vector pack_args = {"MIGRAPHX_LIFT(" + name + ")"}; + transform(range(n), std::back_inserter(pack_args), [&](auto j) { + return prefix + std::to_string(j); + }); + concat_args.push_back("pack(" + join_strings(pack_args, ", ") + ")"); + } + auto src = interpolate_string(concat_kernel, + {{"kernel", options.kernel_name}, + {"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"concat_params", join_strings(concat_params, ", ")}, + {"concat_args", join_strings(concat_args, ", ")}, + {"post", v.get("post", std::string{"op::id{}"})}, + {"transformers", make_transformer_args(vec)}, + {"preamble", v.get("preamble", std::string{})}, + {"axis", std::to_string(concat_axis)}}); + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + auto v = op.to_value(); + if(op.name() == "fused_concat") + { + std::unordered_map mod_names_lookup; + transform(range(ins->module_inputs().size()), + std::inserter(mod_names_lookup, mod_names_lookup.end()), + [&](auto i) { + return 
std::make_pair(ins->module_inputs()[i]->name(), + "pointwise" + std::to_string(i)); + }); + v["preamble"] = transform_accumulate( + ins->module_inputs().begin(), + ins->module_inputs().end(), + std::string{}, + std::plus<>{}, + [&](module_ref mod) { + return generate_pointwise(*mod, mod_names_lookup.at(mod->name())) + "\n"; + }); + std::vector mod_names; + std::transform(ins->module_inputs().begin(), + ins->module_inputs().end() - 1, + std::back_inserter(mod_names), + [&](module_ref mod) { return mod_names_lookup.at(mod->name()); }); + v["ops"] = mod_names; + module_ref last_mod = ins->module_inputs().back(); + v["post"] = "MIGRAPHX_LIFT(" + mod_names_lookup.at(last_mod->name()) + ")"; + std::unordered_map mod_args; + std::transform(ins->module_inputs().begin(), + ins->module_inputs().end() - 1, + std::inserter(mod_args, mod_args.end()), + [&](module_ref mod) { + const auto& name = mod_names_lookup.at(mod->name()); + return std::make_pair(name, mod->get_parameter_names().size()); + }); + v["args"] = mod_args; + auto prefix_name = transform_accumulate(ins->module_inputs().begin(), + ins->module_inputs().end() - 1, + std::string{}, + std::plus<>{}, + [&](module_ref mod) -> std::string { + auto name = generate_name_from_ops(*mod); + if(name.empty()) + return ""; + return name + "_"; + }); + v["kernel"] = prefix_name + "concat_" + + generate_name_from_ops(*(ins->module_inputs().back())) + "_kernel"; + } + else if(op.name() == "concat") + { + auto concat_inputs = ins->inputs().size() - 1; + if(not ins->module_inputs().empty()) + { + auto* pm = ins->module_inputs().front(); + concat_inputs = ins->inputs().size() - pm->get_parameter_names().size(); + v["preamble"] = generate_pointwise(*pm, "post_concat"); + v["post"] = "MIGRAPHX_LIFT(post_concat)"; + v["kernel"] = "concat_" + generate_name_from_ops(*pm) + "_kernel"; + } + std::vector mod_names(concat_inputs, "op::id{}"); + v["ops"] = mod_names; + std::unordered_map mod_args = {{"op::id{}", 1}}; + v["args"] = mod_args; + } + return compile_op(ctx, to_shapes(ins->inputs()), v); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/concat_past_present.cpp b/docker/rocm/migraphx/targets/gpu/jit/concat_past_present.cpp new file mode 100644 index 000000000..b18d70108 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/concat_past_present.cpp @@ -0,0 +1,119 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +// NOLINTNEXTLINE +static const char* const concat_past_present_kernel = R"__migraphx__( +#include +#include +#include +#include +#include + +namespace migraphx { + + + +extern "C" { + + +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + transform_args(make_tensors())(${args})([](auto... xs) { + + concat_past_present(xs..., make_gqa_parameters(${gqa_params})); + }); +} + + +} + +} // namespace migraphx + +)__migraphx__"; + +struct concat_past_present_compiler : compiler +{ + std::vector names() const + { + return {"concat_past_present", "gpu::concat_past_present"}; + } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + auto params = init_params(inputs, v); + auto gqa_params_str = params.make_init_str(); + + hip_compile_options options; + options.set_launch_params(v, + compute_global_for(ctx, + 2 * params.batch_size * params.kv_num_heads * + params.sequence_length * + params.head_size)); + options.inputs = inputs; + options.output = inputs.front(); + options.kernel_name = v.get("kernel", "concat_past_present_kernel"); + options.output_arg = 0; + + auto src = interpolate_string(concat_past_present_kernel, + {{"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"gqa_params", gqa_params_str}, + {"kernel", options.kernel_name}}); + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + auto shapes = to_shapes(ins->inputs()); + auto v = op.to_value(); + return compile_op(ctx, shapes, v); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/gather.cpp b/docker/rocm/migraphx/targets/gpu/jit/gather.cpp new file mode 100644 index 000000000..9dc17db09 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/gather.cpp @@ -0,0 +1,89 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +// NOLINTNEXTLINE +static const char* const gather_kernel = R"__migraphx__( +#include +#include +#include +#include +#include + +namespace migraphx { + +extern "C" { + +MIGRAPHX_GLOBAL void gather_kernel(void* in_data, void* in_indices, void* output) +{ + make_tensors()(in_data, in_indices, output)([](auto&&... xs) { + gather<${axis}>(xs...); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct gather_compiler : compiler +{ + std::vector names() const { return {"gather"}; } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + hip_compile_options options; + const auto& out_s = inputs.back(); + options.set_launch_params(v, compute_global_for(ctx, out_s.elements())); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "gather_kernel"; + options.virtual_inputs = inputs; + + auto axis = v.at("axis").to(); + + auto src = interpolate_string(gather_kernel, {{"axis", axis}}); + + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + return compile_op(ctx, to_shapes(ins->inputs()), op.to_value()); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/gathernd.cpp b/docker/rocm/migraphx/targets/gpu/jit/gathernd.cpp new file mode 100644 index 000000000..05a48f4e9 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/gathernd.cpp @@ -0,0 +1,91 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +// NOLINTNEXTLINE +static const char* const gathernd_kernel = R"__migraphx__( +#include +#include +#include +#include +#include + +namespace migraphx { + +extern "C" { + +MIGRAPHX_GLOBAL void gathernd_kernel(void* in_data, void* in_indices, void* output) +{ + make_tensors()(in_data, in_indices, output)([](auto&&... xs) { + auto settings = make_gathernd_settings(MIGRAPHX_MAKE_CONSTANT(int64_t{BATCH_DIMS})); + gathernd(xs..., settings); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct gathernd_compiler : compiler +{ + std::vector names() const { return {"gathernd"}; } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + hip_compile_options options; + const auto& out_s = inputs.back(); + options.set_launch_params(v, compute_global_for(ctx, out_s.elements())); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "gathernd_kernel"; + options.virtual_inputs = inputs; + + // batch_dims + assert(v.contains("batch_dims")); + auto batch_dims = v.at("batch_dims").to(); + options.emplace_param("-DBATCH_DIMS=" + std::to_string(batch_dims)); + + return compile_hip_code_object(ctx, gathernd_kernel, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + return compile_op(ctx, to_shapes(ins->inputs()), op.to_value()); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/gqa_rotary_embedding.cpp b/docker/rocm/migraphx/targets/gpu/jit/gqa_rotary_embedding.cpp new file mode 100644 index 000000000..340616352 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/gqa_rotary_embedding.cpp @@ -0,0 +1,114 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +// NOLINTNEXTLINE +static const char* const gqa_rotary_embedding_kernel = R"__migraphx__( +#include +#include +#include +#include +#include + +namespace migraphx { + + + +extern "C" { + + +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + transform_args(make_tensors(), rotate_last())(${args})([](auto... xs) { + + gqa_rotary_embedding(xs..., make_gqa_parameters(${gqa_params})); + }); +} + + +} + +} // namespace migraphx + +)__migraphx__"; + +struct gqa_rotary_embedding_compiler : compiler +{ + std::vector names() const + { + return {"gqa_rotary_embedding", "gpu::gqa_rotary_embedding"}; + } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + auto params = init_params(inputs, v); + auto gqa_params_str = params.make_init_str(); + + hip_compile_options options; + options.set_launch_params(v, compute_global_for(ctx, inputs.back().elements())); + options.inputs = inputs; + options.output = inputs.back(); + options.kernel_name = v.get("kernel", "gqa_rotary_embedding_kernel"); + + auto src = interpolate_string(gqa_rotary_embedding_kernel, + {{"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"gqa_params", gqa_params_str}, + {"kernel", options.kernel_name}}); + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + auto shapes = to_shapes(ins->inputs()); + auto v = op.to_value(); + return compile_op(ctx, shapes, v); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/gqa_softmax.cpp b/docker/rocm/migraphx/targets/gpu/jit/gqa_softmax.cpp new file mode 100644 index 000000000..c1ff241dd --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/gqa_softmax.cpp @@ -0,0 +1,113 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +// NOLINTNEXTLINE +static const char* const gqa_softmax_kernel = R"__migraphx__( +#include +#include +#include +#include +#include + +namespace migraphx { + + + +extern "C" { + + +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + transform_args(make_tensors(), rotate_last())(${args})([](auto... xs) { + + gqa_softmax(xs..., make_gqa_parameters(${gqa_params})); + }); +} + + +} + +} // namespace migraphx + +)__migraphx__"; + +struct gqa_softmax_compiler : compiler +{ + std::vector names() const { return {"gqa_softmax", "gpu::gqa_softmax"}; } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + auto params = init_params(inputs, v); + auto gqa_params_str = params.make_init_str(); + + hip_compile_options options; + options.set_launch_params( + v, + compute_global_for(ctx, params.batch_size * params.num_heads * params.sequence_length)); + options.inputs = inputs; + options.output = inputs.back(); + options.kernel_name = v.get("kernel", "gqa_softmax_kernel"); + + auto src = interpolate_string(gqa_softmax_kernel, + {{"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"gqa_params", gqa_params_str}, + {"kernel", options.kernel_name}}); + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + auto shapes = to_shapes(ins->inputs()); + auto v = op.to_value(); + return compile_op(ctx, shapes, v); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/layernorm.cpp b/docker/rocm/migraphx/targets/gpu/jit/layernorm.cpp new file mode 100644 index 000000000..09736031b --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/layernorm.cpp @@ -0,0 +1,131 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +static const char* const layernorm_kernel = R"__migraphx__( +#include +#include +#include +#include +#include + +namespace migraphx { + +${preamble} + +extern "C" { +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto... xs) { + ${layernorm}<${axis}>(${post}, ${eps}, xs...); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct layernorm_compiler : compiler +{ + std::vector names() const + { + return {"layernorm", "gpu::prelayernorm", "gpu::preadd_layernorm"}; + } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + // TODO: Use reduce_dims + auto axis = inputs.front().lens().size() - 1; + auto faxis = find_fast_axis({inputs.front()}); + vectorize vec{}; + // Vectorize if the axis is a reduction axis + if(axis == faxis) + { + vec = vectorize::elements(ctx, faxis, inputs); + } + auto relements = inputs[0].lens()[axis] / vec.size; + auto nelements = (inputs.back().elements() / inputs[0].lens()[axis]); + auto block_size = compute_block_size(ctx, relements, 256); + hip_compile_options options; + options.set_launch_params( + v, compute_global_for(ctx, nelements * block_size, 256), block_size); + options.output = inputs.back(); + options.inputs = inputs; + options.kernel_name = v.get("kernel", "layernorm_kernel"); + auto eps = v.get("epsilon", 1e-12f); + + auto src = interpolate_string(layernorm_kernel, + {{"kernel", options.kernel_name}, + {"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"transformers", make_transformer_args(vec)}, + {"post", v.get("post", std::string{"op::id{}"})}, + {"preamble", v.get("preamble", std::string{})}, + {"layernorm", v.get("layernorm", std::string{"layernorm"})}, + {"axis", to_string(axis)}, + {"eps", to_string(eps)}}); + + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + auto v = op.to_value(); + v["layernorm"] = "layernorm"; + v["kernel"] = "layernorm_kernel"; + if(op.name() == "gpu::preadd_layernorm") + { + v["layernorm"] = "add_layernorm"; + v["kernel"] = "add_layernorm_kernel"; + } + if(not ins->module_inputs().empty()) + { + auto* pm = ins->module_inputs().front(); + v["preamble"] = generate_pointwise(*pm, "post_layernorm"); + v["post"] = "MIGRAPHX_LIFT(post_layernorm)"; + v["kernel"] = + v["layernorm"].to() + "_" + generate_name_from_ops(*pm) + "_kernel"; + } + return compile_op(ctx, to_shapes(ins->inputs()), v); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/mlir.cpp b/docker/rocm/migraphx/targets/gpu/jit/mlir.cpp new file mode 100644 index 000000000..4893743c2 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/mlir.cpp @@ -0,0 +1,283 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_DUMP_TO_MXR); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_DUMP); + +static module create_pointwise_module(module_ref in_mod) +{ + module pw_mod; + std::unordered_map map_ins; + for(auto param : in_mod->get_parameters()) + { + map_ins[param] = + pw_mod.add_parameter(any_cast(param->get_operator()).parameter, + shape{param->get_shape().type()}); + } + auto return_args = pw_mod.add_instructions( + in_mod, + &map_ins, + [](module& m, + instruction_ref ins, + const operation& op, + const std::vector& inputs, + const std::vector& mod_args) -> instruction_ref { + if(op.name() == "multibroadcast" and inputs.front()->name() == "@literal") + return inputs.front(); + else + return m.insert_instruction(ins, op, inputs, mod_args); + }); + pw_mod.add_return(return_args); + return pw_mod; +} + +struct mlir_compiler : compiler +{ + std::vector names() const { return {"gpu::mlir_op"}; } + + operation compile_op(context&, const std::vector&, const value&) const { return {}; } + + compiler_replace + compile(context& ctx, instruction_ref ins, const operation&, const value& solution) const + { + auto* smod = ins->module_inputs().front(); + assert(smod->get_parameter_names().size() == ins->inputs().size() - 1); + auto gemm_like_ins = std::find_if(smod->begin(), smod->end(), [&](const auto& i) { + return contains({"dot", "quant_dot", "convolution", "quant_convolution"}, i.name()); + }); + auto pointwise_ins = std::find_if(gemm_like_ins, smod->end(), [&](const auto& i) { + return i.get_operator().attributes().get("pointwise", false) == true; + }); + + // check if (a) module is fused (b) contains a "gemm/conv" instruction and (c) + // perfConfig can not allow fused module + if(gemm_like_ins != smod->end() and pointwise_ins != smod->end() and + not is_module_fusible(*smod, ctx, solution)) + { + auto input_args = ins->inputs(); + // remove alloc buffer + input_args.pop_back(); + auto split_ins = std::prev(pointwise_ins); + std::array mod_splits; + mod_splits = smod->split(input_args, {split_ins}); + auto dot_mlir_inputs = to_shapes(mod_splits[0].inputs); + // add alloc for the gemm output + dot_mlir_inputs.push_back(mod_splits[0].mod.get_output_shapes().front()); + mlir_code_object cop1 = compile_mlir(ctx, 
mod_splits[0].mod, dot_mlir_inputs, solution); + auto pw_shapes = to_shapes(mod_splits[1].inputs); + if(mod_splits[1].mod.get_output_shapes().size() == 1) + { + pw_shapes.push_back(mod_splits[1].mod.get_output_shapes().front()); + } + else + { + pw_shapes.push_back(shape{mod_splits[1].mod.get_output_shapes()}); + } + assert(pw_shapes.back() == ins->get_shape()); + auto pw_mod = create_pointwise_module(&mod_splits[1].mod); + auto cop2 = compile_pointwise(ctx, pw_shapes, &pw_mod); + std::vector cops = {cop1, + mlir_code_object{any_cast(cop2)}}; + return insert(cops, mod_splits, ins, split_ins); + } + return insert(compile_mlir(ctx, *smod, to_shapes(ins->inputs()), solution)); + } + + compiler_replace insert(const mlir_code_object& mco) const + { + return {std::vector{mco.cop}, + [=](module& m, instruction_ref ins, const std::vector& ops) { + std::vector inputs = ins->inputs(); + + // Tuple inputs not supported + assert(std::all_of(inputs.begin(), inputs.end() - 1, [](auto i) { + return i->get_shape().sub_shapes().empty(); + })); + + // Multiple output case (allocate ins will give a tuple) + std::vector flat_inputs(inputs); + bool multi_out = not flat_inputs.back()->get_shape().sub_shapes().empty(); + if(multi_out) + { + auto allocs = flat_inputs.back(); + flat_inputs.pop_back(); + auto sub_shape_idx = range(allocs->get_shape().sub_shapes().size()); + std::transform(sub_shape_idx.begin(), + sub_shape_idx.end(), + std::back_inserter(flat_inputs), + [&](int i) { + return m.insert_instruction( + ins, + migraphx::make_op("get_tuple_elem", {{"index", i}}), + allocs); + }); + } + std::vector tuple_replacements; + + for(const auto i : range(mco.prefill_indices.size())) + { + auto prefilled_ins = m.insert_instruction( + ins, + migraphx::make_op("hip::fill", {{"value", mco.prefill_values[i]}}), + flat_inputs[mco.prefill_indices[i]]); + if(not multi_out or mco.prefill_indices[i] < inputs.size() - 1) + { + replace(inputs, inputs[mco.prefill_indices[i]], prefilled_ins); + } + else + { + tuple_replacements.push_back(prefilled_ins); + } + } + + if(multi_out and not tuple_replacements.empty()) + { + // Add identity to make sure fill operations happen before kernel call + tuple_replacements.insert(tuple_replacements.begin(), inputs.back()); + inputs.back() = m.insert_instruction( + ins, migraphx::make_op("identity"), tuple_replacements); + } + + auto mlir = insert_mlir(m, ins, any_cast(ops.front()), inputs); + return m.replace_instruction(ins, mlir); + }, + &trace}; + } + + compiler_replace insert(const std::vector& mcos, + const std::array& mods, + instruction_ref precompile_ins, + instruction_ref split_ins) const + { + std::vector cobjs(mcos.size()); + std::transform( + mcos.begin(), mcos.end(), cobjs.begin(), [](const auto& mco) { return mco.cop; }); + auto precompiled_inputs = precompile_ins->inputs(); + return { + cobjs, [=](module& m, instruction_ref ins, const std::vector& ops) { + auto compiled_inputs = ins->inputs(); + std::unordered_map inputs_rep_map; + for(const auto i : range(precompiled_inputs.size())) + { + inputs_rep_map[precompiled_inputs[i]] = compiled_inputs[i]; + } + auto dot_inputs = mods[0].inputs; + auto dot_mod_out_shape = mods[0].mod.get_output_shapes().front(); + auto dot_alloc = m.insert_instruction( + ins, + migraphx::make_op("hip::allocate", {{"shape", to_value(dot_mod_out_shape)}})); + dot_inputs.push_back(dot_alloc); + for(const auto i : range(mcos[0].prefill_indices.size())) + { + auto prefilled_ins = m.insert_instruction( + ins, + migraphx::make_op("hip::fill", {{"value", 
mcos[0].prefill_values[i]}}), + dot_inputs[mcos[0].prefill_indices[i]]); + replace(dot_inputs, dot_inputs[mcos[0].prefill_indices[i]], prefilled_ins); + } + + std::vector dot_inputs_updated; + std::transform(dot_inputs.begin(), + dot_inputs.end(), + std::back_inserter(dot_inputs_updated), + [&](const auto& i) { + if(inputs_rep_map.find(i) != inputs_rep_map.end()) + { + assert(inputs_rep_map.at(i)->get_shape() == i->get_shape()); + return inputs_rep_map.at(i); + } + return i; + }); + auto mlir_ins = + insert_mlir(m, ins, any_cast(ops[0]), dot_inputs_updated); + auto pwm = mods[1]; + pwm.replace(split_ins, mlir_ins); + auto pw_inputs = pwm.inputs; + pw_inputs.push_back(ins->inputs().back()); + std::vector pw_inputs_updated; + std::transform(pw_inputs.begin(), + pw_inputs.end(), + std::back_inserter(pw_inputs_updated), + [&](const auto& i) { + if(inputs_rep_map.find(i) != inputs_rep_map.end()) + { + assert(inputs_rep_map.at(i)->get_shape() == i->get_shape()); + return inputs_rep_map.at(i); + } + return i; + }); + auto pw_ins = + insert_mlir(m, ins, any_cast(ops[1]), pw_inputs_updated); + return m.replace_instruction(ins, pw_ins); + }}; + } + + optional get_tuning_config(const context& ctx, + instruction_ref ins, + const operation&, + bool exhaustive) const + { + static const auto mxr_loc = string_value_of(MIGRAPHX_MLIR_DUMP_TO_MXR{}); + static const auto mlir_loc = string_value_of(MIGRAPHX_MLIR_DUMP{}); + + auto shapes = to_shapes(ins->inputs()); + auto* smod = ins->module_inputs().front(); + if(not mxr_loc.empty()) + { + dump_mlir_to_mxr(*smod, ins->inputs(), mxr_loc); + } + if(not mlir_loc.empty()) + { + dump_mlir_to_file(*smod, shapes, mlir_loc); + } + return get_tuning_config_mlir(ctx, *smod, shapes, exhaustive); + } + + static void trace(std::ostream& os, instruction_ref ins) + { + auto shapes = to_shapes(ins->inputs()); + auto* smod = ins->module_inputs().front(); + os << dump_mlir(*smod, shapes); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/pad.cpp b/docker/rocm/migraphx/targets/gpu/jit/pad.cpp new file mode 100644 index 000000000..9ea77ee99 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/pad.cpp @@ -0,0 +1,121 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +static const char* const pointwise_kernel = R"__migraphx__( +#include +#include +#include +#include + +namespace migraphx { + +extern "C" { +MIGRAPHX_GLOBAL void pad_kernel(void* input_p, void* output_p) +{ + auto offsets = index_ints<${offsets}>{}; + auto idx = make_index(); + make_tensors()(input_p, output_p)([&](auto input, auto output) { + pad(idx, offsets, input, output, ${pad_val}); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct pad_compiler : compiler +{ + std::vector names() const { return {"pad"}; } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + auto padding = v.at("pads").to_vector(); + auto input_lens = inputs.front().lens(); + std::vector offsets(input_lens.size()); + std::copy(padding.begin(), padding.begin() + offsets.size(), offsets.begin()); + + auto offset_lens = input_lens; + std::transform(input_lens.begin(), + input_lens.end(), + offsets.begin(), + offset_lens.begin(), + [&](auto input, auto offset) { return input + offset; }); + + auto vinputs = inputs; + vinputs.push_back(inputs.front().with_lens(offset_lens)); + auto rinputs = reduce_dims(normalize_permutation(vinputs)); + + auto rinput_lens = rinputs.front().lens(); + auto roffset_lens = rinputs.back().lens(); + std::vector roffsets(roffset_lens.size()); + std::transform(rinput_lens.begin(), + rinput_lens.end(), + roffset_lens.begin(), + roffsets.begin(), + [](auto input, auto offset_dim) { return offset_dim - input; }); + rinputs.pop_back(); + + hip_compile_options options; + options.inputs = inputs; + options.output = inputs.back(); + options.virtual_inputs = rinputs; + options.kernel_name = "pad_kernel"; + options.set_launch_params(v, compute_global_for(ctx, inputs.at(1).elements())); + + auto pad_val = v.get("value", 0.f); + auto pad_val_string = to_string(pad_val); + if(float_equal(pad_val, std::numeric_limits::lowest())) + pad_val_string = "lowest{}"; + if(float_equal(pad_val, std::numeric_limits::max())) + pad_val_string = "highest{}"; + + auto src = interpolate_string( + pointwise_kernel, + {{"pad_val", to_string(pad_val_string)}, {"offsets", to_string_range(roffsets)}}); + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + return compile_op(ctx, to_shapes(ins->inputs()), op.to_value()); + } +}; +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/pointwise.cpp b/docker/rocm/migraphx/targets/gpu/jit/pointwise.cpp new file mode 100644 index 000000000..9e352888e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/pointwise.cpp @@ -0,0 +1,122 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +static const char* const pointwise_kernel = R"__migraphx__( +#include +#include +#include + +namespace migraphx { + +${preamble} + +extern "C" { +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + auto idx = make_index(); + pointwise<${noutputs}, ${tiled}>(idx, ${transformers})(${lambda}, ${args}); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct pointwise_compiler : compiler +{ + std::vector names() const { return {"pointwise", "contiguous", "layout"}; } + + static std::size_t oversubscribe_if(bool b) + { + if(b) + return 256; + else + return 1; + } + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + hip_compile_options options; + options.inputs = flatten(inputs); + options.output = inputs.back(); + options.virtual_inputs = reduce_dims(normalize_permutation(options.inputs)); + options.emplace_param("-Wno-float-equal"); + auto axis = find_fast_axis(options.virtual_inputs); + auto vec = vectorize::elements(ctx, axis, options.virtual_inputs); + options.kernel_name = v.get("kernel", "kernel"); + auto noutputs = options.inputs.size() - inputs.size() + 1; + auto t = tile::elements(options.virtual_inputs, noutputs); + // auto t = tile{}; + if(t.ntiles == 0) + options.set_launch_params( + v, compute_global_for(ctx, options.inputs.front().elements() / vec.size, 256)); + else + options.set_launch_params( + v, compute_global_for(ctx, t.ntiles * t.block_size, 256), t.block_size); + auto src = + interpolate_string(pointwise_kernel, + {{"kernel", options.kernel_name}, + {"params", enum_params(options.inputs.size(), "void * private_p")}, + {"args", enum_params(options.inputs.size(), "private_p")}, + {"lambda", v.at("lambda").to()}, + {"transformers", make_transformer_args(t, vec)}, + {"tiled", t.ntiles > 0 ? 
"true" : "false"}, + {"noutputs", std::to_string(noutputs)}, + {"preamble", v.get("preamble", std::string{})}}); + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + if(contains({"layout", "contiguous"}, op.name())) + { + return compile_op(ctx, + to_shapes(ins->inputs()), + {{"lambda", "[](auto x) { return make_tuple(x); }"}, + {"kernel", op.name() + "_kernel"}}); + } + else + { + assert(not ins->module_inputs().empty()); + const_module_ref pm = ins->module_inputs().front(); + return compile_pointwise(ctx, to_shapes(ins->inputs()), pm); + } + } +}; +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/pooling.cpp b/docker/rocm/migraphx/targets/gpu/jit/pooling.cpp new file mode 100644 index 000000000..f245a2269 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/pooling.cpp @@ -0,0 +1,193 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +// NOLINTNEXTLINE +static const char* const pooling_kernel = R"__migraphx__( +#include +#include +#include +#include +#include + +namespace migraphx { + +extern "C" { + +MIGRAPHX_GLOBAL void pooling_kernel(void* in_data, void* output) +{ + transform_args(make_tensors(), rotate_last())(in_data, output)([](auto&&... 
xs) { + pooling<${algo}, ${group_size}>(${op}, make_window(index_ints<${window}>{}, index_ints<${stride}>{}, index_ints<${padding}>{}), xs...); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct pooling_compiler : compiler +{ + + static std::size_t compute_subwave_size(context& ctx, std::size_t n) + { + std::size_t max_wavefront_size = ctx.get_current_device().get_wavefront_size(); + std::size_t wavefront_size = 1; + while(wavefront_size <= n and wavefront_size < max_wavefront_size) + wavefront_size *= 2; + return wavefront_size / 2; + } + + struct algorithm + { + std::string name = "reduce::lane"; + std::size_t reduce_size = 1; + std::size_t block_size = 256; + std::size_t group_size = 1; + + static std::size_t compute_group_size(const shape& output) + { + auto n = output.lens().back(); + const std::size_t max_group_size = 32; + std::size_t group_size = 1; + while((n % (group_size * 2) == 0) and group_size <= max_group_size) + group_size *= 2; + return group_size; + } + + algorithm() {} + + algorithm(context& ctx, const shape& input, const std::vector& window) + { + if(input.strides().back() != 1) + return; + std::size_t max_wavefront_size = ctx.get_current_device().get_wavefront_size(); + auto wsize = window.back(); + if(wsize > max_wavefront_size) + { + block_size = compute_block_size(ctx, wsize, 256); + reduce_size = block_size; + name = "reduce::block"; + } + else + { + block_size = max_wavefront_size; + reduce_size = compute_subwave_size(ctx, wsize); + name = "reduce::subwave<" + to_string(reduce_size) + ">"; + } + } + }; + + template + static void normalize(std::vector& inputs, Ts&... xs) + { + auto perm = find_permutation(inputs); + std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](auto s) { + return reorder_shape(s, perm); + }); + each_args([&](auto& dims) { dims = reorder_dims(dims, perm); }, xs...); + } + + std::vector names() const { return {"pooling"}; } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + hip_compile_options options; + const auto& out_s = inputs.back(); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "pooling_kernel"; + options.virtual_inputs = inputs; + + auto ndim = out_s.ndim(); + auto pool_ndim = ndim - 2; + + auto read_value = [&](const std::string& name, std::size_t def) { + if(v.contains(name)) + { + std::vector result(2, def); + auto x = v.at(name).to_vector(); + if(x.size() >= pool_ndim) + result.insert(result.end(), x.begin(), x.begin() + pool_ndim); + return result; + } + else + { + std::vector result(ndim, def); + return result; + } + }; + + auto padding = read_value("padding", 0); + auto stride = read_value("stride", 1); + auto window = read_value("lengths", 1); + + const auto& mode_v = v.at("mode"); + std::string mode = + mode_v.is_string() ? 
mode_v.get_string() : to_string(mode_v.to()); + bool count_include_pad = v.get("count_include_pad", false); + if(count_include_pad and mode == "average") + mode = "average_include_pad"; + + std::string op = mode + "_pool"; + if(mode == "lpnorm") + op += "<" + v.at("lp_order").to() + ">"; + + algorithm algo{}; + options.set_launch_params( + v, + compute_global_for(ctx, (out_s.elements() / algo.group_size) * algo.reduce_size, 256), + algo.block_size); + normalize(options.virtual_inputs, padding, stride, window); + auto src = interpolate_string(pooling_kernel, + {{"op", op + "{}"}, + {"algo", algo.name}, + {"group_size", to_string(algo.group_size)}, + {"window", to_string_range(window)}, + {"stride", to_string_range(stride)}, + {"padding", to_string_range(padding)}}); + + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + return compile_op(ctx, to_shapes(ins->inputs()), op.to_value()); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/reduce.cpp b/docker/rocm/migraphx/targets/gpu/jit/reduce.cpp new file mode 100644 index 000000000..bdf7313f5 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/reduce.cpp @@ -0,0 +1,408 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +static const char* const simple_reduce_kernel = R"__migraphx__( +#include +#include +#include +#include + +namespace migraphx { + +${preamble} + +extern "C" { +MIGRAPHX_GLOBAL void reduce_kernel(void* input_p, void* output_p) +{ + + transform_args(make_tensors(), ${transformers})(input_p, output_p)([](auto input, auto output) { + + simple_reduce(${reduction}, ${init}, input, output, ${read}, ${write}); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +static std::vector get_reduce_lens(const std::vector& input_lens, + const std::vector& output_lens) +{ + std::vector reduce_lens; + std::transform(output_lens.begin(), + output_lens.end(), + input_lens.begin(), + std::back_inserter(reduce_lens), + [](auto x, auto y) -> std::size_t { + if(x == y) + return 1; + else + return y; + }); + return reduce_lens; +} + +template +static shape get_reduced_shape(const shape& s, const std::vector& axes) +{ + auto lens = s.lens(); + std::fill(lens.begin(), lens.end(), 1); + for(const auto& axis : axes) + lens[axis] = s.lens()[axis]; + return s.with_lens(lens); +} + +template +static shape get_output_shape(const shape& s, const std::vector& axes) +{ + auto lens = s.lens(); + for(const auto& axis : axes) + lens[axis] = 1; + return s.with_lens(lens); +} + +template +static std::string get_reduce_algo(context& ctx, const std::vector& inputs, ReduceLens rlens) +{ + const auto init = std::numeric_limits::max(); + auto relements = std::accumulate(rlens.begin(), rlens.end(), 1, std::multiplies<>{}); + // The minimum stride + auto min_stride = std::inner_product( + rlens.begin(), + rlens.end(), + inputs.front().strides().begin(), + init, + [](auto x, auto y) { return std::min(x, y); }, + [](auto len, auto stride) { return len == 1 ? init : stride; }); + if(min_stride > 2) + return "lane"; + if(relements <= ctx.get_current_device().get_wavefront_size()) + return "wave"; + return "block"; +} + +static std::string get_reduce_algo(context& ctx, const std::vector& inputs) +{ + auto rlens = get_reduce_lens(inputs.front().lens(), inputs.back().lens()); + return get_reduce_algo(ctx, inputs, rlens); +} + +static std::size_t compute_subwave_size(context& ctx, std::size_t n) +{ + std::size_t max_wavefront_size = ctx.get_current_device().get_wavefront_size(); + std::size_t wavefront_size = 1; + while(wavefront_size <= n and wavefront_size < max_wavefront_size) + wavefront_size *= 2; + return wavefront_size; +} + +/// This will adjust the input shapes so a partial reduction is done per workgroup. +/// This is done by splitting the reduction axis so each split group becomes +/// part of the batch. So if we want to do a split redution of a tensor +/// {K}, then this will create a tensor of {K/N, N} where N is the number of +/// split groups. To compute the number of split groups it finds the largest +/// divisor that can divide K to make it less than min_size. 
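+/// As a concrete illustration (assuming the default min_size of 1024 and the
+/// factor set {2, 3, 5, 7, 11} used in the loop below): a reduction of length
+/// 8192 is halved three times, giving N = 8 and a reshaped tensor of
+/// {1024, 8}, so each workgroup reduces a 1024-element slice and the 8
+/// partial results can be combined in a follow-up step.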
+static std::vector split_reduce(const std::vector& inputs, + std::size_t min_size = 1024) +{ + std::vector result; + auto input_shape = inputs.front(); + const auto& reduce_shape = inputs[inputs.size() - 2]; + const auto& output_shape = inputs[inputs.size() - 1]; + + auto is = range(reduce_shape.lens().size()); + using array_type = std::array; + auto initial = array_type{std::numeric_limits::max(), + std::numeric_limits::max()}; + auto faxis = transform_accumulate( + is.begin(), is.end(), initial, MIGRAPHX_LIFT(std::min), [&](auto i) -> array_type { + if(input_shape.lens()[i] == output_shape.lens()[i]) + return initial; + return {input_shape.strides()[i], std::size_t(i)}; + })[1]; + + assert(faxis < reduce_shape.lens().size()); + + std::size_t n = 1; + auto r = input_shape.lens()[faxis]; + auto factors = make_array(2, 3, 5, 7, 11); + while(r > min_size) + { + // NOLINTNEXTLINE(readability-qualified-auto) + auto it = std::find_if(factors.begin(), factors.end(), [&](auto d) { return r % d == 0; }); + if(it == factors.end()) + break; + r /= *it; + n *= *it; + } + assert(n != 1); + std::transform( + inputs.begin(), inputs.end(), std::back_inserter(result), [&](const shape& s) -> shape { + auto lens = s.lens(); + auto strides = s.strides(); + + lens.push_back(n); + if(lens[faxis] == 1) + { + strides.push_back(0); + } + else + { + lens[faxis] /= n; + strides.push_back(strides[faxis] * lens[faxis]); + } + + return {s.type(), lens, strides}; + }); + return reduce_dims(normalize_permutation(result)); +} + +struct simple_reduce_compiler : compiler +{ + std::vector names() const + { + return {"simple_reduce", + "reduce_sum", + "reduce_mean", + "reduce_max", + "reduce_min", + "reduce_prod", + "reduce_any", + "reduce_all"}; + } + + static std::size_t get_reduce_elements(const std::vector& inputs) + { + return inputs.front().elements() / inputs.back().elements(); + } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + hip_compile_options options; + options.inputs = inputs; + options.output = inputs.back(); + options.virtual_inputs = reduce_dims(inputs); + auto faxis = find_fast_axis({options.virtual_inputs.front()}); + vectorize vec{}; + auto nelements = options.virtual_inputs.back().elements(); + auto algo = v.get("algo", get_reduce_algo(ctx, options.virtual_inputs)); + if(algo == "block" or algo == "wave") + { + // Vectorize if the axis is a reduction axis + if(options.virtual_inputs.back().lens()[faxis] == 1) + vec = vectorize::elements(ctx, faxis, options.virtual_inputs); + auto relements = get_reduce_elements(options.virtual_inputs) / vec.size; + if(algo == "block") + { + auto block_size = compute_block_size(ctx, relements, 256); + if(relements >= block_size * 256) + algo = "block_large"; + options.set_launch_params( + v, compute_global_for(ctx, nelements * block_size, 256), block_size); + } + else + { + auto subwave_size = compute_subwave_size(ctx, relements); + algo = "subwave<" + std::to_string(subwave_size) + ">"; + options.set_launch_params(v, + compute_global_for(ctx, nelements * subwave_size, 256), + ctx.get_current_device().get_wavefront_size()); + } + } + else if(algo == "lane") + { + options.set_launch_params(v, compute_global_for(ctx, nelements, 256)); + } + else + { + MIGRAPHX_THROW("Unknown reduce algo: " + algo); + } + options.kernel_name = "reduce_kernel"; + std::string identity = "[](auto x) { return x; }"; + auto src = interpolate_string(simple_reduce_kernel, + {{"reduction", v.at("reduction").to()}, + {"init", v.get("init", 
std::string{"0"})}, + {"read", v.get("read", identity)}, + {"write", v.get("write", identity)}, + {"algo", algo}, + {"transformers", make_transformer_args(vec)}, + {"preamble", v.get("preamble", std::string{})}}); + options.emplace_param("-Wno-float-equal"); + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + value v = value::object{}; + reduce_op r{}; + r.set(ins, op); + v["reduction"] = r.reduction; + v["read"] = r.read; + v["write"] = r.write; + v["init"] = r.init; + return compile_op(ctx, to_shapes(ins->inputs()), v); + } +}; + +static const char* const fused_reduce_kernel = R"__migraphx__( +#include +#include +#include +#include +#include + +namespace migraphx { + +${preamble} + +extern "C" { +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + transform_args(make_tensors(), ${transformers}, rotate_and_pack_last<${noutputs}>())(${args})([](auto y, auto... xs) { + fused_reduce(y, ${assign}{}, partial(${lambda})(xs...)); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct fused_reduce_compiler : compiler +{ + std::vector names() const { return {"fused_reduce", "split_fused_reduce"}; } + + static shape get_input_shape(const std::vector& inputs) + { + auto it = std::max_element(inputs.begin(), + inputs.end(), + by(std::less<>{}, [](const shape& s) { return s.elements(); })); + return *it; + } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + auto assign = v.get("assign", "assign_none"); + auto axes = v.at("axes").to_vector(); + auto finputs = flatten(inputs); + auto noutputs = finputs.size() - inputs.size() + 1; + auto virtual_inputs = finputs; + virtual_inputs.push_back(get_reduced_shape(get_input_shape(finputs), axes)); + virtual_inputs.push_back(get_output_shape(get_input_shape(finputs), axes)); + virtual_inputs = reduce_dims(normalize_permutation(virtual_inputs)); + if(assign != "assign_none") + virtual_inputs = split_reduce(virtual_inputs); + auto reduce_output_shape = virtual_inputs.back(); + virtual_inputs.pop_back(); + auto reduction_shape = virtual_inputs.back(); + virtual_inputs.pop_back(); + + hip_compile_options options; + options.inputs = finputs; + options.output = inputs.back(); + options.virtual_inputs = virtual_inputs; + auto faxis = find_fast_axis({options.virtual_inputs.front()}); + vectorize vec{}; + auto nelements = reduce_output_shape.elements(); + auto algo = + v.get("algo", get_reduce_algo(ctx, options.virtual_inputs, reduction_shape.lens())); + if(algo == "block" or algo == "wave") + { + // Vectorize if the axis is a reduction axis + if(reduce_output_shape.lens()[faxis] == 1) + vec = vectorize::elements(ctx, faxis, options.virtual_inputs); + auto relements = reduction_shape.elements() / vec.size; + if(algo == "block") + { + auto block_size = compute_block_size(ctx, relements, 256); + if(relements >= block_size * 256) + algo = "block_large"; + options.set_launch_params( + v, compute_global_for(ctx, nelements * block_size, 256), block_size); + } + else + { + auto subwave_size = compute_subwave_size(ctx, relements); + algo = "subwave<" + std::to_string(subwave_size) + ">"; + options.set_launch_params(v, + compute_global_for(ctx, nelements * subwave_size, 256), + ctx.get_current_device().get_wavefront_size()); + } + } + else if(algo == "lane") + { + options.set_launch_params(v, compute_global_for(ctx, nelements, 256)); + } + else + { + MIGRAPHX_THROW("Unknown reduce algo: " + algo); + } + options.kernel_name = 
v.get("kernel", "reduce_kernel"); + auto src = interpolate_string( + fused_reduce_kernel, + {{"kernel", options.kernel_name}, + {"params", enum_params(finputs.size(), "void * private_p")}, + {"args", enum_params(finputs.size(), "private_p")}, + {"assign", assign}, + {"algo", algo}, + {"reduced", "decltype(" + generate_make_shape(reduce_output_shape) + ")"}, + {"lambda", v.at("lambda").to()}, + {"transformers", make_transformer_args(vec)}, + {"noutputs", std::to_string(noutputs)}, + {"preamble", v.get("preamble", std::string{})}}); + options.emplace_param("-Wno-float-equal"); + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + assert(not ins->module_inputs().empty()); + auto v = op.to_value(); + auto* rm = ins->module_inputs().front(); + v["preamble"] = generate_reduce(*rm, "fused_reduce_op"); + v["lambda"] = "MIGRAPHX_LIFT(fused_reduce_op)"; + v["kernel"] = generate_name_from_ops(*rm) + "_kernel"; + return compile_op(ctx, to_shapes(ins->inputs()), v); + } +}; +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/roialign.cpp b/docker/rocm/migraphx/targets/gpu/jit/roialign.cpp new file mode 100644 index 000000000..aeaf7a858 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/roialign.cpp @@ -0,0 +1,104 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#if !MIGRAPHX_USE_MIOPEN +#include +#endif + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +// NOLINTNEXTLINE +static const char* const roialign_kernel = R"__migraphx__( +#include +#include +#include +#include + +namespace migraphx { + +extern "C" { + +MIGRAPHX_GLOBAL void roialign_kernel(void* in_x, void* in_rois, void* in_ind, void* y) +{ + make_tensors()(in_x, in_rois, in_ind, y)([](auto&&... 
xs) { + auto settings = make_roalign_settings(MIGRAPHX_MAKE_CONSTANT(float{ROIS_OFFSET}), + _c, + _c, + MIGRAPHX_MAKE_CONSTANT(float{SPATIAL_SCALE})); + roialign(xs..., settings); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct roialign_compiler : compiler +{ + std::vector names() const { return {"roialign"}; } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + hip_compile_options options; + options.set_launch_params(v, compute_global_for(ctx, inputs.back().elements()), 128); + options.output = inputs.back(); + options.inputs = inputs; + options.kernel_name = "roialign_kernel"; + + // sampling_ratio + options.emplace_param("-DSAMPLING_RATIO=" + v.at("sampling_ratio").to()); + + // pooling_mode + auto mode = v.at("mode").to(); + std::string is_avg_pooling = + (mode == migraphx::op::pooling_mode::average) ? "true" : "false"; + options.emplace_param("-DIS_AVG_POOLING=" + is_avg_pooling); + + // coord_trans_mode + auto ctm = v.at("coordinate_transformation_mode").to(); + float rois_offset = (ctm == "half_pixel") ? -0.5f : 0.0f; + options.emplace_param("-DROIS_OFFSET=" + std::to_string(rois_offset)); + + // spatial_scale + options.emplace_param("-DSPATIAL_SCALE=" + v.at("spatial_scale").to()); + + return compile_hip_code_object(ctx, roialign_kernel, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + return compile_op(ctx, to_shapes(ins->inputs()), op.to_value()); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/scatter.cpp b/docker/rocm/migraphx/targets/gpu/jit/scatter.cpp new file mode 100644 index 000000000..1a1264f23 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/scatter.cpp @@ -0,0 +1,78 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "scatter.hpp" + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +// NOLINTNEXTLINE +static const char* const scatter_elements_kernel = R"__migraphx__( +#include +#include +#include +#include + +namespace migraphx { + +extern "C" { + +MIGRAPHX_GLOBAL void scatter_elements_kernel(void* in_indices, void* in_updates, void* output) +{ + make_tensors()(in_indices, in_updates, output)([](auto&&... 
xs) { + scatter<${axis}, ${skip_out_of_bounds}>(xs..., ${reduction}{}); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct scatter_elements_compiler : scatter_compiler +{ + std::vector names() const + { + return {"scatter_none", "scatter_add", "scatter_mul", "scatter_min", "scatter_max"}; + } + + std::string make_interpolated_string(const operation& op) const + { + const auto reduction = op.name().substr(std::char_traits::length("scatter_")); + auto axis = std::to_string(op.to_value().get("axis", 0)); + auto skip_out_of_bounds = std::to_string(op.to_value().get("skip_out_of_bounds", 0)); + + return interpolate_string(scatter_elements_kernel, + {{"reduction", "assign_" + reduction}, + {"axis", axis}, + {"skip_out_of_bounds", skip_out_of_bounds}}); + } + + std::string get_kernel_name(const operation&) const { return "scatter_elements_kernel"; } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/scatter.hpp b/docker/rocm/migraphx/targets/gpu/jit/scatter.hpp new file mode 100644 index 000000000..6fb955647 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/scatter.hpp @@ -0,0 +1,81 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_JIT_SCATTER_HPP +#define MIGRAPHX_GUARD_JIT_SCATTER_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +template +struct scatter_compiler : compiler +{ + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + const auto inputs = + to_shapes(std::vector{ins->inputs().begin() + 1, ins->inputs().end()}); + + hip_compile_options options; + options.set_launch_params(op.to_value(), compute_global_for(ctx, inputs.at(1).elements())); + options.inputs = inputs; + options.output = inputs.back(); + options.kernel_name = derived().get_kernel_name(op); + options.virtual_inputs = inputs; + options.emplace_param("-DMIGRAPHX_ALLOW_ATOMIC_CAS=1"); + + const auto src = derived().make_interpolated_string(op); + return prepend_copy_data_to_output(compile_hip_code_object(ctx, src, options)); + } + + // ONNX spec states the following for ScatterElements and ScatterND: + // "The output of the operation is produced by creating a copy of the input data, ..." 
+ // The sole responsibility of the MIGraphX Scatter operator implementations being to perform the + // update operations as specified by ONNX, it is necessary to place the copying of the input + // data before the MIGraphX operator in the graph. + compiler_replace prepend_copy_data_to_output(const operation& co) const + { + return {co, [](module& m, instruction_ref ins, const operation& op) { + auto args = ins->inputs(); + args.back() = + m.insert_instruction(ins, make_op("hip::copy"), args.front(), args.back()); + args.erase(args.begin()); + return m.replace_instruction(ins, op, args); + }}; + } + + std::string get_kernel_name(const operation& op) const { return op.name() + "_kernel"; } + + const Derived& derived() const { return static_cast(*this); } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/jit/scatternd.cpp b/docker/rocm/migraphx/targets/gpu/jit/scatternd.cpp new file mode 100644 index 000000000..cf3ab00ed --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/scatternd.cpp @@ -0,0 +1,73 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "scatter.hpp" + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +// NOLINTNEXTLINE +static const char* const scatternd_kernel = R"__migraphx__( +#include +#include +#include +#include + +namespace migraphx { + +extern "C" { + +MIGRAPHX_GLOBAL void scatternd_kernel(void* in_indices, void* in_updates, void* output) +{ + make_tensors()(in_indices, in_updates, output)([](auto&&... 
xs) { + scatternd(xs..., ${reduction}{}); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct scatternd_compiler : scatter_compiler +{ + std::vector names() const + { + return { + "scatternd_none", "scatternd_add", "scatternd_mul", "scatternd_min", "scatternd_max"}; + } + + std::string make_interpolated_string(const operation& op) const + { + const auto reduction = op.name().substr(std::char_traits::length("scatternd_")); + return interpolate_string(scatternd_kernel, {{"reduction", "assign_" + reduction}}); + } + + std::string get_kernel_name(const operation&) const { return "scatternd_kernel"; } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/softmax.cpp b/docker/rocm/migraphx/targets/gpu/jit/softmax.cpp new file mode 100644 index 000000000..d2e24a233 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/softmax.cpp @@ -0,0 +1,104 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_FAST_SOFTMAX) + +using namespace migraphx::gpu::gen; // NOLINT + +static const char* const softmax_kernel = R"__migraphx__( +#include +#include +#include +#include + +namespace migraphx { + +extern "C" { +MIGRAPHX_GLOBAL void softmax_kernel(void* input_p, void* output_p) +{ + transform_args(make_tensors(), ${transformers})(input_p, output_p)([](auto input, auto output) { + softmax<${axis}>(input, output); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct softmax_compiler : compiler +{ + std::vector names() const { return {"softmax"}; } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + // TODO: Use reduce_dims + auto axis = v.at("axis").to(); + auto faxis = find_fast_axis({inputs.front()}); + vectorize vec{}; + // Vectorize if the axis is a reduction axis + if(faxis == axis) + { + vec = vectorize::elements(ctx, faxis, inputs); + } + auto relements = inputs[0].lens()[axis] / vec.size; + auto nelements = (inputs.back().elements() / inputs[0].lens()[axis]); + auto block_size = compute_block_size(ctx, relements, 256); + hip_compile_options options; + options.set_launch_params( + v, compute_global_for(ctx, nelements * block_size, 256), block_size); + options.output = inputs.back(); + options.inputs = inputs; + options.kernel_name = "softmax_kernel"; + + if(enabled(MIGRAPHX_USE_FAST_SOFTMAX{})) + options.emplace_param("-DMIGRAPHX_USE_FAST_SOFTMAX"); + + auto src = interpolate_string( + softmax_kernel, + {{"transformers", make_transformer_args(vec)}, {"axis", to_string(axis)}}); + + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + return compile_op(ctx, to_shapes(ins->inputs()), op.to_value()); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/jit/unpack_int4.cpp b/docker/rocm/migraphx/targets/gpu/jit/unpack_int4.cpp new file mode 100644 index 000000000..68f2038b8 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/jit/unpack_int4.cpp @@ -0,0 +1,90 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "migraphx/instruction.hpp" +#include "migraphx/instruction_ref.hpp" +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +static const char* const unpack_int4_kernel = R"__migraphx__( +#include +#include + +namespace migraphx { + +extern "C" { + +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + transform_args(make_tensors(), rotate_last())(${args})([](auto... xs) { + unpack_int4<${axis}>(xs...); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct unpack_int4_compiler : compiler +{ + std::vector names() const { return {"unpack_int4"}; } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + hip_compile_options options; + options.inputs = inputs; + options.output = inputs.back(); + options.virtual_inputs = reduce_dims(normalize_permutation(options.inputs)); + options.kernel_name = "unpack_int4_kernel"; + options.set_launch_params(v, compute_global_for(ctx, inputs.front().elements())); + + auto src = + interpolate_string(unpack_int4_kernel, + {{"kernel", options.kernel_name}, + {"params", enum_params(options.inputs.size(), "void * private_p")}, + {"args", enum_params(options.inputs.size(), "private_p")}, + {"axis", std::to_string(v.at("axis").to())}}); + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + return compile_op(ctx, to_shapes(ins->inputs()), op.to_value()); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/kernel.cpp b/docker/rocm/migraphx/targets/gpu/kernel.cpp new file mode 100644 index 000000000..a7c79bded --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernel.cpp @@ -0,0 +1,159 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#else +// extern declare the function since hip/hip_ext.h header is broken +extern hipError_t hipExtModuleLaunchKernel(hipFunction_t, // NOLINT + uint32_t, + uint32_t, + uint32_t, + uint32_t, + uint32_t, + uint32_t, + size_t, + hipStream_t, + void**, + void**, + hipEvent_t = nullptr, + hipEvent_t = nullptr, + uint32_t = 0); +#endif + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +extern std::string hip_error(int error); + +using hip_module_ptr = MIGRAPHX_MANAGE_PTR(hipModule_t, hipModuleUnload); + +struct kernel_impl +{ + hip_module_ptr module = nullptr; + hipFunction_t fun = nullptr; +}; + +hip_module_ptr load_module(const char* image) +{ + hipModule_t raw_m; + auto status = hipModuleLoadData(&raw_m, image); + hip_module_ptr m{raw_m}; + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to load module: " + hip_error(status)); + return m; +} + +kernel::kernel(const char* image, const std::string& name) : impl(std::make_shared()) +{ + impl->module = load_module(image); + auto status = hipModuleGetFunction(&impl->fun, impl->module.get(), name.c_str()); + if(hipSuccess != status) + MIGRAPHX_THROW("Failed to get function: " + name + ": " + hip_error(status)); +} + +void launch_kernel(hipFunction_t fun, + hipStream_t stream, + std::size_t global, + std::size_t local, + void* kernargs, + std::size_t size, + hipEvent_t start, + hipEvent_t stop) +{ + assert(global > 0); + assert(local > 0); + void* config[] = { +// HIP_LAUNCH_PARAM_* are macros that do horrible things +#ifdef MIGRAPHX_USE_CLANG_TIDY + nullptr, kernargs, nullptr, &size, nullptr +#else + HIP_LAUNCH_PARAM_BUFFER_POINTER, + kernargs, + HIP_LAUNCH_PARAM_BUFFER_SIZE, + &size, + HIP_LAUNCH_PARAM_END +#endif + }; + + auto status = hipExtModuleLaunchKernel(fun, + global, + 1, + 1, + local, + 1, + 1, + 0, + stream, + nullptr, + reinterpret_cast(&config), + start, + stop); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to launch kernel: " + hip_error(status)); + if(stop != nullptr) + { + status = hipEventSynchronize(stop); + if(status != hipSuccess) + MIGRAPHX_THROW("Failed to sync event: " + hip_error(status)); + } +} + +void kernel::launch(hipStream_t stream, + std::size_t global, + std::size_t local, + std::vector args, + hipEvent_t start, + hipEvent_t stop) const +{ + assert(impl != nullptr); + void* kernargs = reinterpret_cast(args.data()); + std::size_t size = args.size() * sizeof(void*); + + launch_kernel(impl->fun, stream, global, local, kernargs, size, start, stop); +} + +void kernel::launch(hipStream_t stream, + std::size_t global, + std::size_t local, + const std::vector& args, + hipEvent_t start, + hipEvent_t stop) const +{ + assert(impl != nullptr); + std::vector kernargs = pack_args(args); + std::size_t size = kernargs.size(); + + launch_kernel(impl->fun, stream, global, local, kernargs.data(), size, start, stop); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/algorithm.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/algorithm.hpp new file mode 100644 index 000000000..2e5b376c2 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/algorithm.hpp @@ -0,0 +1,334 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_ALGORITHM_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_ALGORITHM_HPP + +#include + +namespace migraphx { + +template +constexpr void swap(T& a, T& b) noexcept +{ + T old = a; + a = b; + b = old; +} + +template +constexpr void iter_swap(Iterator1 a, Iterator2 b) +{ + if(a == b) + return; + swap(*a, *b); +} + +struct less +{ + template + constexpr auto operator()(T x, U y) const + { + return x < y; + } +}; + +struct greater +{ + template + constexpr auto operator()(T x, U y) const + { + return x > y; + } +}; + +template +constexpr T accumulate(InputIt first, InputIt last, T init, BinaryOperation op) +{ + for(; first != last; ++first) + { + init = op(static_cast(init), *first); + } + return init; +} + +template +constexpr OutputIt copy(InputIt first, InputIt last, OutputIt d_first) +{ + while(first != last) + { + *d_first++ = *first++; + } + return d_first; +} + +template +constexpr OutputIt copy_if(InputIt first, InputIt last, OutputIt d_first, UnaryPredicate pred) +{ + for(; first != last; ++first) + { + if(pred(*first)) + { + *d_first = *first; + ++d_first; + } + } + return d_first; +} + +template +constexpr Iterator is_sorted_until(Iterator first, Iterator last, Compare comp) +{ + if(first != last) + { + Iterator next = first; + while(++next != last) + { + if(comp(*next, *first)) + return next; + first = next; + } + } + return last; +} + +template +constexpr bool is_sorted(Iterator first, Iterator last, Compare comp) +{ + return is_sorted_until(first, last, comp) == last; +} + +template +constexpr F for_each(Iterator first, Iterator last, F f) +{ + for(; first != last; ++first) + { + f(*first); + } + return f; +} + +template +constexpr Iterator find_if(Iterator first, Iterator last, Predicate p) +{ + for(; first != last; ++first) + { + if(p(*first)) + { + return first; + } + } + return last; +} + +template +constexpr Iterator find(Iterator first, Iterator last, const T& value) +{ + return find_if(first, last, [&](const auto& x) { return x == value; }); +} + +template +constexpr bool any_of(InputIt first, InputIt last, UnaryPredicate p) +{ + return find_if(first, last, p) != last; +} + +template +constexpr bool none_of(InputIt first, InputIt last, UnaryPredicate p) +{ + return find_if(first, last, p) == last; +} + +template +constexpr bool all_of(InputIt first, InputIt last, UnaryPredicate p) +{ + return none_of(first, last, [=](auto&& x) { return not p(x); }); +} + 
+template +constexpr Iterator1 search(Iterator1 first, Iterator1 last, Iterator2 s_first, Iterator2 s_last) +{ + for(;; ++first) + { + Iterator1 it = first; + for(Iterator2 s_it = s_first;; ++it, ++s_it) + { + if(s_it == s_last) + { + return first; + } + if(it == last) + { + return last; + } + if(not(*it == *s_it)) + { + break; + } + } + } +} + +template +constexpr T inner_product(InputIt1 first1, + InputIt1 last1, + InputIt2 first2, + T init, + BinaryOperation1 op1, + BinaryOperation2 op2) +{ + while(first1 != last1) + { + init = op1(init, op2(*first1, *first2)); + ++first1; + ++first2; + } + return init; +} + +template +constexpr T inner_product(InputIt1 first1, InputIt1 last1, InputIt2 first2, T init) +{ + return inner_product( + first1, + last1, + first2, + init, + [](auto x, auto y) { return x + y; }, + [](auto x, auto y) { return x * y; }); +} + +template +constexpr bool equal(Iterator1 first1, Iterator1 last1, Iterator2 first2, BinaryPred p) +{ + for(; first1 != last1; ++first1, ++first2) + if(not p(*first1, *first2)) + { + return false; + } + return true; +} + +template +constexpr void iota(Iterator first, Iterator last, T value) +{ + for(; first != last; ++first, ++value) + *first = value; +} + +template +constexpr Iterator min_element(Iterator first, Iterator last, Compare comp) +{ + if(first == last) + return last; + + Iterator smallest = first; + + while(++first != last) + if(comp(*first, *smallest)) + smallest = first; + + return smallest; +} + +template +constexpr Iterator rotate(Iterator first, Iterator middle, Iterator last) +{ + if(first == middle) + return last; + + if(middle == last) + return first; + + Iterator write = first; + Iterator next_read = first; + + for(Iterator read = middle; read != last; ++write, ++read) + { + if(write == next_read) + next_read = read; + iter_swap(write, read); + } + + rotate(write, next_read, last); + return write; +} + +template +constexpr Iterator upper_bound(Iterator first, Iterator last, const T& value, Compare comp) +{ + auto count = last - first; + + while(count > 0) + { + auto it = first; + auto step = count / 2; + it += step; + + if(not comp(value, *it)) + { + first = ++it; + count -= step + 1; + } + else + count = step; + } + + return first; +} + +template +constexpr void sort(Iterator first, Iterator last, Compare comp) +{ + if(first == last) + return; + for(auto i = first; i != last - 1; ++i) + iter_swap(i, min_element(i, last, comp)); + MIGRAPHX_ASSERT(is_sorted(first, last, comp)); +} + +template +constexpr void sort(Iterator first, Iterator last) +{ + sort(first, last, less{}); +} + +template +constexpr void stable_sort(Iterator first, Iterator last, Compare comp) +{ + if(first == last) + return; + for(auto i = first; i != last; ++i) + rotate(upper_bound(first, i, *i, comp), i, i + 1); + MIGRAPHX_ASSERT(is_sorted(first, last, comp)); +} + +template +constexpr void stable_sort(Iterator first, Iterator last) +{ + stable_sort(first, last, less{}); +} + +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/args.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/args.hpp new file mode 100644 index 000000000..2706e14a4 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/args.hpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_ARGS_HPP +#define MIGRAPHX_GUARD_KERNELS_ARGS_HPP + +#include +#include + +namespace migraphx { + +// Use template specialization since ADL is broken on hcc +template +struct make_tensor; + +template +__device__ auto make_tensors_impl(F f, detail::seq, Ts*... xs) +{ + return f(make_tensor::apply(xs)...); +} + +inline __device__ auto make_tensors() +{ + return [](auto*... xs) { + return [=](auto f) { return make_tensors_impl(f, detail::gens{}, xs...); }; + }; +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_ARGS_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/array.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/array.hpp new file mode 100644 index 000000000..623d4d8dd --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/array.hpp @@ -0,0 +1,388 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_ARRAY_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_ARRAY_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \ + template \ + constexpr array& operator op(const array& x) \ + { \ + array_detail::array_for_each(*this, x)([](auto& sy, auto sx) { sy op sx; }); \ + return *this; \ + } \ + template {})> \ + constexpr array& operator op(const U& x) \ + { \ + array_detail::array_for_each (*this)([&](auto& sy) { sy op x; }); \ + return *this; \ + } \ + template \ + friend constexpr auto operator binary_op(const array& x, const array& y) \ + { \ + array z{}; \ + array_detail::array_for_each(z, x, y)( \ + [&](auto& sz, auto sx, auto sy) { sz = sx binary_op sy; }); \ + return z; \ + } \ + template {})> \ + friend constexpr auto operator binary_op(const array& x, const U& y) \ + { \ + array z{}; \ + array_detail::array_for_each(z, x)([&](auto& sz, auto sx) { sz = sx binary_op y; }); \ + return z; \ + } \ + template {})> \ + friend constexpr auto operator binary_op(const U& x, const array& y) \ + { \ + array z{}; \ + array_detail::array_for_each(z, y)([&](auto& sz, auto sy) { sz = x binary_op sy; }); \ + return z; \ + } + +namespace array_detail { +template +constexpr auto is_vectorizable() +{ + return not is_same{} and (is_fundamental{} or is_same{}); +} + +template +__device__ auto& array2vec(T& x) +{ + using value_type = typename T::value_type; + constexpr auto size = decltype(x.size()){}; + using type = vec; + if constexpr(is_const{}) + return reinterpret_cast(x); + else + return reinterpret_cast(x); +} + +template +constexpr auto array_for_each(T& x, Ts&... xs) +{ + MIGRAPHX_ASSERT(((x.size() == xs.size()) and ...)); + return [&](auto f) { + constexpr auto size = decltype(x.size()){}; + if constexpr((is_vectorizable() or + (is_vectorizable() or ...)) and + size <= 8 and size > 1 and (size % 2 == 0)) + { + if(__builtin_is_constant_evaluated()) + { + for(index_int i = 0; i < size; i++) + f(x[i], xs[i]...); + } + else + { + using vec_type = remove_reference_t; + f(array2vec(x), __builtin_convertvector(array2vec(xs), vec_type)...); + } + } + else + { + for(index_int i = 0; i < size; i++) + f(x[i], xs[i]...); + } + }; +} +} // namespace array_detail + +template +struct array +{ + using value_type = T; + T d[N]; + + constexpr array() = default; + + template {} and ...))> + constexpr array(Ts... 
xs) : d{xs...} + { + } + + template {} and (N > 1))> + constexpr explicit array(U x) + { + for(index_int i = 0; i < N; i++) + d[i] = x; + } + + constexpr T& operator[](index_int i) + { + MIGRAPHX_ASSERT(i < N); + return d[i]; + } + constexpr const T& operator[](index_int i) const + { + MIGRAPHX_ASSERT(i < N); + return d[i]; + } + + constexpr T& front() { return d[0]; } + constexpr const T& front() const { return d[0]; } + + constexpr T& back() { return d[N - 1]; } + constexpr const T& back() const { return d[N - 1]; } + + constexpr T* data() { return d; } + constexpr const T* data() const { return d; } + + constexpr index_constant size() const { return {}; } + constexpr auto empty() const { return size() == _c<0>; } + + constexpr T* begin() { return d; } + constexpr const T* begin() const { return d; } + + constexpr T* end() { return d + size(); } + constexpr const T* end() const { return d + size(); } + + constexpr T dot(const array& x) const + { + auto r = x * (*this); + return r.reduce([](auto a, auto b) { return a + b; }, 0); + } + + constexpr T product() const + { + return reduce([](auto x, auto y) { return x * y; }, 1); + } + + constexpr T single(index_int width = 100) const + { + T result = 0; + T a = 1; + for(index_int i = 0; i < N; i++) + { + result += d[N - i - 1] * a; + a *= width; + } + return result; + } + + template + constexpr auto apply(F f) const + { + array result; + for(index_int i = 0; i < N; i++) + result[i] = f(d[i]); + return result; + } + + template + constexpr auto reduce(F f, T init) const + { + T result = init; + for(index_int i = 0; i < N; i++) + result = f(result, d[i]); + return result; + } + + MIGRAPHX_DEVICE_ARRAY_OP(+=, +) + MIGRAPHX_DEVICE_ARRAY_OP(-=, -) + MIGRAPHX_DEVICE_ARRAY_OP(*=, *) + MIGRAPHX_DEVICE_ARRAY_OP(/=, /) + MIGRAPHX_DEVICE_ARRAY_OP(%=, %) + MIGRAPHX_DEVICE_ARRAY_OP(&=, &) + MIGRAPHX_DEVICE_ARRAY_OP(|=, |) + MIGRAPHX_DEVICE_ARRAY_OP(^=, ^) + + friend constexpr bool operator==(const array& x, const array& y) + { + for(index_int i = 0; i < N; i++) + { + if(x[i] != y[i]) + return false; + } + return true; + } + + template {})> + friend constexpr bool operator==(const array& x, const U& y) + { + for(index_int i = 0; i < N; i++) + { + if(x[i] != y) + return false; + } + return true; + } + + template {})> + friend constexpr bool operator==(const U& x, const array& y) + { + return y == x; + } + + template + friend constexpr bool operator!=(const U& x, const array& y) + { + return not(x == y); + } + template + friend constexpr bool operator!=(const array& x, const U& y) + { + return not(x == y); + } + // This uses the product order rather than lexical order + friend constexpr bool operator<(const array& x, const array& y) + { + for(index_int i = 0; i < N; i++) + { + if(not(x[i] < y[i])) + return false; + } + return true; + } + friend constexpr bool operator>(const array& x, const array& y) { return y < x; } + friend constexpr bool operator<=(const array& x, const array& y) { return (x < y) or (x == y); } + friend constexpr bool operator>=(const array& x, const array& y) { return (y < x) or (x == y); } + + constexpr array carry(array result) const + { + index_int overflow = 0; + for(diff_int i = result.size() - 1; i > 0; i--) + { + auto z = result[i] + overflow; + // Reset overflow + overflow = 0; + // Compute overflow using while loop instead of mod + while(z >= d[i]) + { + z -= d[i]; + overflow += 1; + } + result[i] = z; + } + result[0] += overflow; + return result; + } + + /// Get the multi-dimensional index from the given 1D index. 
+ constexpr array multi(T idx) const + { + array result; + index_int tidx = idx; + for(diff_int is = result.size() - 1; is > 0; is--) + { + result[is] = tidx % d[is]; + tidx = tidx / d[is]; + } + result[0] = tidx; + return result; + } + + template + friend constexpr const Stream& operator<<(const Stream& ss, const array& a) + { + for(index_int i = 0; i < N; i++) + { + if(i > 0) + ss << ", "; + ss << a[i]; + } + return ss; + } +}; + +template +constexpr auto array_apply(F f) +{ + return [=](auto&& x) { return x.apply(f); }; +} + +template +constexpr array make_array(T x, Ts... xs) +{ + return {x, static_cast(xs)...}; +} +template +struct integral_const_array : array +{ + using base_array = array; + MIGRAPHX_DEVICE_CONSTEXPR integral_const_array() : base_array({Xs...}) {} + + constexpr const base_array& base() const { return *this; } +}; + +template +constexpr auto make_const_array(T x, Ts... xs) +{ + return integral_const_array{}; +} + +template +constexpr auto generate_array(N n, F f) +{ + return sequence_c([=](auto... is) { return array{f(is)...}; }); +} + +template +constexpr auto unpack(integral_const_array, F f) +{ + return f(_c...); +} + +template +constexpr auto transform(integral_const_array, F f) +{ + return integral_const_array{}; +} + +template +constexpr auto transform_i(integral_const_array, F f) +{ + return sequence_c( + [=](auto... is) { return integral_const_array{}; }); +} + +template +constexpr auto transform(integral_const_array, integral_const_array, F f) +{ + return integral_const_array{}; +} + +template +constexpr auto return_array_c(F f) +{ + constexpr auto r = f(); + return sequence(r.size(), [&](auto... is) { return make_const_array(_c...); }); +} + +template +using index_ints = integral_const_array; + +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/atomic.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/atomic.hpp new file mode 100644 index 000000000..76e0409cc --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/atomic.hpp @@ -0,0 +1,142 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + */ +#ifndef MIGRAPHX_GUARD_KERNELS_ATOMIC_HPP +#define MIGRAPHX_GUARD_KERNELS_ATOMIC_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef MIGRAPHX_ALLOW_ATOMIC_CAS +// NOLINTNEXTLINE +#define MIGRAPHX_ALLOW_ATOMIC_CAS 0 +#endif + +// NOLINTNEXTLINE +#define MIGRAPHX_ATOMIC_CAS_WARNING() \ + MIGRAPHX_ASSERT(MIGRAPHX_ALLOW_ATOMIC_CAS and "Using atomicCAS is slow") + +namespace migraphx { +namespace atomic { + +using cas_rank = rank<1>; + +template +MIGRAPHX_DEVICE_CONSTEXPR void cas(rank<1>, T& x, T y, Op op) +{ + MIGRAPHX_ATOMIC_CAS_WARNING(); + using storage = conditional_t; + storage* address = reinterpret_cast(&x); + storage expected = __hip_atomic_load(address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + while(not __hip_atomic_compare_exchange_strong(address, + &expected, + bit_cast(op(bit_cast(expected), y)), + __ATOMIC_RELAXED, + __ATOMIC_RELAXED, + __HIP_MEMORY_SCOPE_AGENT)) + { + } +} + +template +MIGRAPHX_DEVICE_CONSTEXPR auto cas(rank<0>, vec& x, vec y, Op op) + -> decltype(cas(cas_rank{}, x[0], y[0], op), void()) +{ + for(index_int i = 0; i < N; i++) + { + cas(cas_rank{}, x[i], y[i], op); + } +} + +template +MIGRAPHX_DEVICE_CONSTEXPR auto builtin_assign(T& x, T y, op::sum) + MIGRAPHX_RETURNS(unsafeAtomicAdd(&x, y)); + +__device__ inline void builtin_assign(half2& x, half2 y, op::sum) +{ + __builtin_amdgcn_global_atomic_fadd_v2f16(&x, y); +} + +template +constexpr bool is_aligned(const void* ptr) +{ + auto iptr = bit_cast(ptr); + return (iptr % alignof(T)) == 0; +} + +__device__ inline void builtin_assign(half& x, half y, op::sum) +{ + half* address = &x; + if(is_aligned(address)) + { + __builtin_amdgcn_global_atomic_fadd_v2f16(address, half2{half(y), half(0)}); + } + else + { + __builtin_amdgcn_global_atomic_fadd_v2f16(address - 1, half2{half(0), half(y)}); + } +} + +template +MIGRAPHX_DEVICE_CONSTEXPR auto builtin_assign(T& x, T y, op::min) + MIGRAPHX_RETURNS(unsafeAtomicMin(&x, y)); + +template +MIGRAPHX_DEVICE_CONSTEXPR auto builtin_assign(T& x, T y, op::max) + MIGRAPHX_RETURNS(unsafeAtomicMax(&x, y)); + +template +MIGRAPHX_DEVICE_CONSTEXPR auto builtin_assign(vec& x, vec y, Op op) + -> decltype(builtin_assign(x[0], y[0], op), void()) +{ + for(index_int i = 0; i < N; i++) + { + builtin_assign(x[i], y[i], op); + } +} + +template +MIGRAPHX_DEVICE_CONSTEXPR auto assign(rank<0>, T& x, T y, Op op) + MIGRAPHX_RETURNS(cas(cas_rank{}, x, y, op)); + +template +MIGRAPHX_DEVICE_CONSTEXPR auto assign(rank<1>, T& x, T y, Op op) + MIGRAPHX_RETURNS(builtin_assign(x, y, op)); + +} // namespace atomic + +template +MIGRAPHX_DEVICE_CONSTEXPR void atomic_assign(T& x, U y, Op op) +{ + atomic::assign(rank<1>{}, x, T(y), op); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_ATOMIC_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp new file mode 100644 index 000000000..e559658a0 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp @@ -0,0 +1,42 @@ +/* ************************************************************************ + * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- + * ies of the Software, and to permit persons to whom the Software is furnished + * to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- + * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- + * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ************************************************************************ */ +#ifndef MIGRAPHX_GUARD_KERNELS_BITCAST_HPP +#define MIGRAPHX_GUARD_KERNELS_BITCAST_HPP + +#include +#include + +namespace migraphx { + +template {} and is_trivially_copyable{})> +inline constexpr auto bit_cast(From fr) noexcept +{ + return vec_transform(fr)([](auto x) -> To { + static_assert(sizeof(To) == sizeof(decltype(x))); + return __builtin_bit_cast(To, x); + }); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_BITCAST_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ck.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ck.hpp new file mode 100644 index 000000000..de22e7b07 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ck.hpp @@ -0,0 +1,175 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_KERNELS_CK_HPP +#define MIGRAPHX_GUARD_KERNELS_CK_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { + +namespace detail { +template +struct to_ck_type_impl +{ + using type = T; +}; +template <> +struct to_ck_type_impl +{ + using type = ck::half_t; +}; + +template +struct to_ck_type_impl +{ + using type = const typename to_ck_type_impl::type; +}; + +template +constexpr bool is_row_major() +{ + constexpr auto strides = Shape{}.strides; + MIGRAPHX_ASSERT(strides.size() >= 2); + if(strides.back() == 1) + { + MIGRAPHX_ASSERT(not Shape{}.is_transposed()); + return true; + } + MIGRAPHX_ASSERT(strides[strides.size() - 2] == 1); + + return false; +} + +} // namespace detail + +template +using to_ck_type = typename detail::to_ck_type_impl::type; + +template +constexpr auto to_ck_pointer(T* x) +{ + return static_cast*>(x); +} + +template +constexpr auto to_ck_const_pointer(const T* x) +{ + return static_cast*>(x); +} + +template +using to_ck_gemm_layout = conditional_t>(), + ck::tensor_layout::gemm::RowMajor, + ck::tensor_layout::gemm::ColumnMajor>; + +template +constexpr auto to_ck_tensor() +{ + constexpr auto s = get_shape_c{}; + return sequence(s.lens.size(), [&](auto... is) { + return ck::make_naive_tensor_descriptor(ck::make_tuple(s.lens[is]...), + ck::make_tuple(s.strides[is]...)); + }); +} + +template +struct ck_function_adaptor : F +{ + template + constexpr ck_function_adaptor(Ts&&... xs) : F(static_cast(xs)...) + { + } + + template + constexpr void operator()(T& out, Ts&&... xs) const + { + out = static_cast(*this)(static_cast(xs)...); + } +}; + +struct ck_nop +{ + template + constexpr void operator()(T&) const + { + } +}; + +struct ck_passthrough +{ + template + constexpr void operator()(T& y, U x) const + { + y = x; + } +}; + +struct ck_scale +{ + constexpr ck_scale(float s) : scale(s) {} + + template + constexpr void operator()(T& y, U x) const + { + y = x * static_cast(scale); + } + + float scale; +}; + +struct ck_add +{ + template + constexpr void operator()(T& y, U x) const + { + y += x; + } +}; + +// In CK, the B matrix is ordered as N,K instead of K,N +template +constexpr auto ck_transposeb_dims(Dims dims) +{ + return unpack(dims, [](auto k, auto n) { return make_const_array(n, k); }); +} + +template +using ck_transposeb = decltype(make_shape(ck_transposeb_dims(get_shape_c{}.lens), + ck_transposeb_dims(get_shape_c{}.strides))); + +#ifdef MIGRAPHX_CK_CHECK +#define MIGRAPHX_CK_STATIC_ASSERT static_assert +#else +#define MIGRAPHX_CK_STATIC_ASSERT(...) +#endif + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_CK_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp new file mode 100644 index 000000000..ccef52132 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp @@ -0,0 +1,62 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_CK_GEMM_HPP +#define MIGRAPHX_GUARD_KERNELS_CK_GEMM_HPP + +#include +#include +#include +#include +#include +#include + +namespace migraphx { + +template +__device__ void ck_gemm_matrix(E e, A a, B b, Ds... ds) +{ + constexpr auto desc = G::make_descriptor(to_ck_tensor(), + to_ck_tensor>(), + ck::make_tuple(to_ck_tensor()...), + to_ck_tensor()); + + MIGRAPHX_STATIC_ASSERT_FOR(desc.IsValid()) + { + G::Run(desc, + to_ck_const_pointer(a.data()), + to_ck_const_pointer(b.data()), + ck::make_tuple(to_ck_const_pointer(ds.data())...), + to_ck_pointer(e.data())); + } +} + +template +__device__ void ck_gemm(Ts... xs) +{ + gemm_batch_args(make_index(), _c, xs...)( + [](auto... ys) { ck_gemm_matrix(ys...); }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp new file mode 100644 index 000000000..8e381f375 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp @@ -0,0 +1,75 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_KERNELS_CK_GEMM_SOFTMAX_GEMM_HPP +#define MIGRAPHX_GUARD_KERNELS_CK_GEMM_SOFTMAX_GEMM_HPP + +#include +#include +#include +#include +#include +#include + +namespace migraphx { + +template +struct ck_gemm_softmax_gemm_settings +{ + T scale{}; +}; + +template +constexpr ck_gemm_softmax_gemm_settings make_ck_gemm_softmax_gemm_settings(Ts... xs) +{ + return {xs...}; +} + +template +__device__ void ck_gemm_softmax_gemm_matrix(C c, A a, B b, B1 b1, Settings s) +{ + constexpr auto desc = G::make_descriptor(to_ck_tensor(), + to_ck_tensor>(), + to_ck_tensor>(), + to_ck_tensor()); + + MIGRAPHX_STATIC_ASSERT_FOR(desc.IsValid()) + { + G::Run(desc, + s.scale, + to_ck_const_pointer(a.data()), + to_ck_const_pointer(b.data()), + to_ck_const_pointer(b1.data()), + to_ck_pointer(c.data())); + } +} + +template +__device__ void ck_gemm_softmax_gemm(Settings s, Ts... xs) +{ + gemm_batch_args(make_index(), _c, xs...)( + [&](auto... ys) { ck_gemm_softmax_gemm_matrix(ys..., s); }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/compute_attention_probabilities.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/compute_attention_probabilities.hpp new file mode 100644 index 000000000..5838d9874 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/compute_attention_probabilities.hpp @@ -0,0 +1,111 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_COMPUTE_ATTENTION_PROBABILITIES_HPP +#define MIGRAPHX_GUARD_KERNELS_COMPUTE_ATTENTION_PROBABILITIES_HPP + +#include +#include +#include + +namespace migraphx { + +template +__device__ void +calculate_attention_probs(AttnProbs attention_probs, // output buffer with size BxNxSxT + Query query, // Q data. 
Its size is BxNxSxH + SeqLensK seqlens_k, // past sequence lengths tensor + PresentKey present_key, // present key only + Params params, + index_int idx) +{ + const index_int batch_size = params.batch_size; + const index_int sequence_length = params.sequence_length; + const index_int head_size = params.head_size; + const index_int present_buffer_sequence_length = params.seqlen_present_kv_cache; + const index_int num_heads = params.num_heads; + const index_int kv_num_heads = params.kv_num_heads; + const index_int packed_batch_stride = + (num_heads + 2 * kv_num_heads) * sequence_length * head_size; + const index_int kv_num_heads_factor = num_heads / kv_num_heads; + const index_int q_input_chunk_length = sequence_length * head_size; // S x H + const index_int present_buff_chunk_length = present_buffer_sequence_length * head_size; // T x H + + const index_int loop_len = batch_size * num_heads; + const float alpha = + params.scale == 0.0f ? 1.0f / sqrt(static_cast(head_size)) : params.scale; + + const index_int i = idx / (sequence_length * present_buffer_sequence_length); + const index_int inner_i = idx % (sequence_length * present_buffer_sequence_length); + if(i < loop_len) + { + const auto batch_index = i / num_heads; + const auto head_index = i % num_heads; + const index_int total_seqlen = seqlens_k[batch_index] + 1; + const index_int output_offset = i * sequence_length * present_buffer_sequence_length; + auto output = attention_probs + output_offset; + auto pk = present_key + ((i / kv_num_heads_factor) * present_buff_chunk_length); + auto q = query + packed_batch_stride * batch_index + q_input_chunk_length * head_index; + + naive_gemm gemm{sequence_length, + total_seqlen, + head_size, + head_size, + head_size, + present_buffer_sequence_length, + true, + alpha, + 0.0f}; + gemm.compute(output, q, pk, inner_i); + } +} + +template +__device__ void compute_attention_probabilities(Output output, + Query query, + PresentKey present_key, + PresentValue, + SeqLensK seqlens_k, + Params params) +{ + auto ind = make_index(); + ind.global_stride( + params.batch_size * params.num_heads * params.sequence_length * + params.seqlen_present_kv_cache, + [&](auto idx) { + calculate_attention_probs( + output.begin(), query.begin(), seqlens_k.begin(), present_key.begin(), params, idx); + }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/compute_attention_scores.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/compute_attention_scores.hpp new file mode 100644 index 000000000..f51cfacd5 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/compute_attention_scores.hpp @@ -0,0 +1,112 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_COMPUTE_ATTENTION_SCORES_HPP +#define MIGRAPHX_GUARD_KERNELS_COMPUTE_ATTENTION_SCORES_HPP + +#include +#include +#include + +namespace migraphx { + +template +__device__ void +calculate_attention_score(Output output, // buffer for the result with size BxSxNxH + const AttnProbs attention_probs, // Attention probs with size BxNxSxT + const SeqLensK seqlens_k, // past sequence lengths tensor + PresentValue present_value, // present value only + Params params, + index_int idx) +{ + const index_int batch_size = params.batch_size; + const index_int num_heads = params.num_heads; + const index_int sequence_length = params.sequence_length; + const index_int head_size = params.head_size; + const index_int hidden_size = params.hidden_size; + const index_int present_buffer_sequence_length = params.seqlen_present_kv_cache; + const index_int kv_num_heads = params.kv_num_heads; + const index_int kv_num_heads_factor = num_heads / kv_num_heads; + const index_int present_buff_chunk_length = present_buffer_sequence_length * head_size; // T x H + + auto loop_len = batch_size * num_heads; + const index_int i = idx / (sequence_length * head_size); + const index_int inner_i = idx % (sequence_length * head_size); + if(i < loop_len) + { + const index_int batch_index = i / num_heads; + const index_int head_index = i % num_heads; + const index_int total_seqlen = seqlens_k[batch_index] + 1; + + auto pv = present_value + ((i / kv_num_heads_factor) * present_buff_chunk_length); + Output output_current = + output + (batch_index * sequence_length * num_heads + head_index) * head_size; + ptrdiff_t attention_probs_offset = sequence_length * present_buffer_sequence_length * i; + + naive_gemm gemm{sequence_length, + head_size, + total_seqlen, + present_buffer_sequence_length, + head_size, + hidden_size, + false, + 1.0f, + 0.0f}; + gemm.compute(output_current, attention_probs + attention_probs_offset, pv, inner_i); + } +} + +template +__device__ void compute_attention_scores(Output output, + Query, + PresentKey, + PresentValue present_value, + SeqLensK seqlens_k, + AttnProbs attn_probs, + Params params) +{ + const index_int elements = + params.batch_size * params.num_heads * params.sequence_length * params.head_size; + auto ind = make_index(); + ind.global_stride(elements, [&](auto idx) { + calculate_attention_score(output.begin(), + attn_probs.begin(), + seqlens_k.begin(), + present_value.begin(), + params, + idx); + }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/concat.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/concat.hpp new file mode 100644 index 000000000..9dd2ec6b6 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/concat.hpp @@ -0,0 +1,87 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include +#include + +#ifndef MIGRAPHX_GUARD_KERNELS_CONCAT_HPP +#define MIGRAPHX_GUARD_KERNELS_CONCAT_HPP + +namespace migraphx { + +template +constexpr auto concat_slice(Output out, Input, Start) +{ + constexpr auto lens = get_shape_c{}.lens; + constexpr auto strides = get_shape_c{}.strides; + constexpr auto offset = return_c([] { + constexpr auto output_shape = get_shape_c{}; + return Start{} * output_shape.strides[Axis]; + }); + constexpr auto s = make_shape(lens, strides); + MIGRAPHX_ASSERT(offset < out.get_shape().element_space()); + MIGRAPHX_ASSERT((s.element_space() + offset) <= out.get_shape().element_space()); + return make_tensor_view(out.data() + offset, s); +} + +template +constexpr auto concat_slices(Input input, Start start, Ts... xs) +{ + return [=](auto f) { return f(concat_slice(xs, input, start)...); }; +} + +template +constexpr auto concat_ends(Input) +{ + constexpr auto lens = get_shape_c{}.lens; + return _c; +} + +template +__device__ auto concat_each(index idx, Start start, InputPack input_pack, F f, Ts... ts) +{ + return input_pack([&](auto g, auto x, auto... xs) { + return concat_slices(x, start, ts...)([&](auto z, auto... ys) { + idx.global_stride(x.get_shape().elements(), + [&](auto i) { z[i] = f(g(x[i], xs[i]...), ys[i]...); }); + + return start + concat_ends(x); + }); + }); +} + +template +__device__ auto concat(InputPacks... input_packs) +{ + return [=](auto f, auto... ts) { + auto idx = make_index(); + fold([&](auto start, auto input_pack) { + return concat_each(idx, start, input_pack, f, ts...); + })(_c<0>, input_packs...); + }; +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_CONCAT_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/concat_past_present.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/concat_past_present.hpp new file mode 100644 index 000000000..dcfebbbe0 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/concat_past_present.hpp @@ -0,0 +1,141 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_CONCAT_PAST_PRESENT_HPP +#define MIGRAPHX_GUARD_KERNELS_CONCAT_PAST_PRESENT_HPP + +#include +#include +#include + +namespace migraphx { + +template +__device__ void copy_data(Dest destination, const Src source, index_int n, index_int idx) +{ + if(idx < n) + { + destination[idx] = source[idx]; + } +} + +struct concat_state_chunk +{ + index_int present_buff_chunk_length; + index_int past_buff_chunk_length; + index_int past_chunk_length; + index_int new_chunk_length; + bool is_prompt; + bool past_present_share_buffer; + std::ptrdiff_t i; + + template + __device__ Present compute(Past past, const Chunk chunk, Present present, index_int idx) + { + auto start = present + i * present_buff_chunk_length; + + auto p = start; + if(not is_prompt) + { + if(not past_present_share_buffer) + { + const auto src_past = past + i * past_buff_chunk_length; + copy_data(p, src_past, past_chunk_length, idx); + } + p += past_chunk_length; + } + copy_data(p, chunk, new_chunk_length, idx); + return start; + } +}; + +template +__device__ void +update_cache(const Present present, SeqLensK seqlens_k, Cache cache, Params params, index_int idx) +{ + const index_int batch_size = params.batch_size; + const index_int sequence_length = params.sequence_length; + const index_int head_size = params.head_size; + const index_int past_buffer_sequence_length = params.seqlen_present_kv_cache; + const index_int present_buffer_sequence_length = past_buffer_sequence_length; + const index_int num_heads = params.num_heads; + const index_int kv_num_heads = params.kv_num_heads; + const bool is_prompt = sequence_length != 1; + const index_int packed_batch_stride = + (num_heads + 2 * kv_num_heads) * sequence_length * head_size; + const index_int kv_num_heads_factor = num_heads / kv_num_heads; + const index_int kv_input_chunk_length = sequence_length * head_size; // L x H + const index_int past_buff_chunk_length = past_buffer_sequence_length * head_size; // L x H + const index_int present_buff_chunk_length = present_buffer_sequence_length * head_size; // T x H + + const index_int loop_len = batch_size * num_heads; + const index_int i = idx / (sequence_length * head_size); + const index_int inner_i = idx % (sequence_length * head_size); + if(i < loop_len) + { + const index_int batch_index = i / num_heads; + const index_int head_index = i % num_heads; + const index_int past_seqlen = sequence_length == 1 + ? 
static_cast(seqlens_k[batch_index]) + : past_buffer_sequence_length; + const index_int past_chunk_length = past_seqlen * head_size; + + auto current = present + packed_batch_stride * batch_index + + kv_input_chunk_length * (head_index / kv_num_heads_factor); + + concat_state_chunk concat{present_buff_chunk_length, + past_buff_chunk_length, + past_chunk_length, + kv_input_chunk_length, + is_prompt, + params.past_present_share_buffer, + i / kv_num_heads_factor}; + concat.compute(cache, current, cache, inner_i); + } +} + +template +__device__ void concat_past_present( + const Query query, PastKey past_key, PastValue past_value, SeqLensK seqlens_k, Params params) +{ + auto ind = make_index(); + auto elements = + 2 * params.batch_size * params.kv_num_heads * params.sequence_length * params.head_size; + ind.global_stride(elements, [&](auto idx) { + auto q = query.begin(); + auto k = q + params.num_heads * params.sequence_length * params.head_size; + auto v = q + (params.num_heads + params.kv_num_heads) * params.sequence_length * + params.head_size; + if(idx < elements / 2) + { + update_cache(k, seqlens_k, past_key.begin(), params, idx); + } + else if(idx < elements) + { + update_cache(v, seqlens_k, past_value.begin(), params, idx - (elements / 2)); + } + }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/copy.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/copy.hpp new file mode 100644 index 000000000..972988992 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/copy.hpp @@ -0,0 +1,65 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + */ +#ifndef MIGRAPHX_GUARD_KERNELS_COPY_HPP +#define MIGRAPHX_GUARD_KERNELS_COPY_HPP + +#include +#include + +namespace migraphx { + +template +__device__ void local_vector_copy(Index idx, T* src, U* dst, Size size) +{ + constexpr auto n = find_vectorize_size([&](auto i) { return (size % i) == 0; }); + auto vsrc = as_vec(remove_bool(src)); + auto vdst = as_vec(remove_bool(dst)); + index_int vsize = size / n; + idx.local_stride(vsize, [&](auto i) { vdst[i] = vsrc[i]; }); +} + +template +__device__ void local_tensor_copy(Index idx, T src, U dst) +{ + constexpr auto src_shape = get_shape_c{}; + constexpr auto dst_shape = get_shape_c{}; + if constexpr(src_shape == dst_shape and (src_shape.packed() or src_shape.broadcasted())) + { + local_vector_copy(idx, src.data(), dst.data(), src_shape.element_space()); + } + else + { + constexpr auto perm = find_permutation(src_shape, dst_shape); + auto new_src = reorder_tensor_view(src, perm); + auto new_dst = reorder_tensor_view(dst, perm); + auto_vectorize()(new_src, new_dst)([&](auto vsrc, auto vdst) { + index_int size = vsrc.get_shape().elements(); + idx.local_stride(size, [&](auto i) { vdst[i] = vsrc[i]; }); + }); + } +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_COPY_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/debug.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/debug.hpp new file mode 100644 index 000000000..5e5e16b13 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/debug.hpp @@ -0,0 +1,230 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_DEBUG_HPP +#define MIGRAPHX_GUARD_KERNELS_DEBUG_HPP + +#include + +namespace migraphx { + +#define MIGRAPHX_STRINGIZE_1(...) #__VA_ARGS__ +#define MIGRAPHX_STRINGIZE(...) MIGRAPHX_STRINGIZE_1(__VA_ARGS__) + +// Workaround hip's broken abort on device code +#ifdef __HIP_DEVICE_COMPILE__ +// NOLINTNEXTLINE +#define MIGRAPHX_HIP_NORETURN +#else +// NOLINTNEXTLINE +#define MIGRAPHX_HIP_NORETURN [[noreturn]] +#endif + +namespace debug { +struct swallow +{ + template + constexpr swallow(Ts&&...) 
+ { + } +}; + +template +struct print_buffer +{ + char buffer[N + 1] = {0}; + char* pos = buffer; + + constexpr void append(char c) + { + if(c == 0) + return; + if(pos < buffer + N) + { + *pos = c; + pos++; + } + } + static constexpr void reverse(char* first, char* last) + { + if(first == last) + return; + last--; + while(first < last) + { + char tmp = *first; + *first = *last; + *last = tmp; + first++; + last--; + } + } + + template + constexpr void append(T i) + { + if(i < 0) + { + append('-'); + i = -i; + } + if(i == 0) + { + append('0'); + return; + } + char* start = pos; + while(i != 0) + { + char c = (i % 10) + '0'; + append(c); + i = i / 10; + } + reverse(start, pos); + } + + constexpr void append(const char* str) + { + if(str == nullptr) + return; + int i = 512; + while(*str != 0 and i > 0) + { + append(*str); + str++; + i--; + } + } + + template + constexpr void append(const char (&array)[M]) + { + for(int i = 0; i < M; i++) + append(array[i]); + } +}; + +template +__host__ __device__ void print(const Ts&... xs) +{ + print_buffer<1024> buffer; + swallow{(buffer.append(xs), 0)...}; + printf("%s", buffer.buffer); +} + +} // namespace debug + +struct source_location +{ + int line = __builtin_LINE(); + const char* file = __builtin_FILE(); + const char* function = __builtin_FUNCTION(); +}; + +template +struct source_location_capture +{ + T x; + source_location loc; + // declval is a workaround since default constructor for "U" is not working with rocm-5.6 + template + static U&& declval(); + template ()))> + constexpr source_location_capture(U px, source_location ploc = source_location{}) + : x(px), loc(ploc) + { + } + template ()))> + constexpr source_location_capture(source_location_capture slc) : x(slc.x), loc(slc.loc) + { + } + + constexpr operator source_location() const { return loc; } + + constexpr operator T() const { return x; } +}; + +template +constexpr auto capture_transform(source_location_capture slc, F f) +{ + auto r = f(slc.x); + return source_location_capture(r, slc.loc); +} + +template +constexpr auto capture_transform(T x, F f) +{ + return f(x); +} + +// noreturn cannot be used on this function because abort in hip is broken +template +MIGRAPHX_HIP_NORETURN inline __host__ __device__ void +assert_fail(const T1& assertion, const T2& file, const T3& line, const T4& function) +{ + // printf is broken on hip with more than one argument, so use a simple print functions instead + debug::print(file, ":", line, ": ", function, ": assertion '", assertion, "' failed.\n"); + // printf("%s:%s: %s: assertion '%s' failed.\n", file, line, function, assertion); + abort(); +} + +template +MIGRAPHX_HIP_NORETURN inline __host__ __device__ void assert_fail(const source_location& loc, + Ts... xs) +{ + debug::print(loc.file, ":", loc.line, ": ", loc.function, ": error: ", xs..., "\n"); + abort(); +} + +// NOLINTNEXTLINE +#define MIGRAPHX_ASSERT_FAIL(cond, ...) \ + ((cond) ? void(0) : [](auto&&... private_migraphx_xs) { \ + assert_fail(private_migraphx_xs...); \ + }(__VA_ARGS__)) + +// NOLINTNEXTLINE +#define MIGRAPHX_CHECK(cond) \ + MIGRAPHX_ASSERT_FAIL(cond, #cond, __FILE__, __LINE__, __PRETTY_FUNCTION__) + +#ifdef MIGRAPHX_DEBUG +// NOLINTNEXTLINE +#define MIGRAPHX_CAPTURE_SOURCE_LOCATION(T) source_location_capture +#define MIGRAPHX_WARN(cond, loc, ...) 
MIGRAPHX_ASSERT_FAIL(cond, loc, __VA_ARGS__) +#define MIGRAPHX_ASSERT MIGRAPHX_CHECK +#define MIGRAPHX_ASSUME MIGRAPHX_CHECK +#define MIGRAPHX_UNREACHABLE() MIGRAPHX_ASSERT(false) +#else +// NOLINTNEXTLINE +#define MIGRAPHX_CAPTURE_SOURCE_LOCATION(T) T +#define MIGRAPHX_ASSUME __builtin_assume +#define MIGRAPHX_UNREACHABLE __builtin_unreachable +#define MIGRAPHX_ASSERT(cond) +#define MIGRAPHX_WARN(...) +#endif + +#define MIGRAPHX_STATIC_ASSERT_FOR(...) \ + static_assert(__VA_ARGS__); \ + if constexpr(__VA_ARGS__) + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_DEBUG_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/dfor.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/dfor.hpp new file mode 100644 index 000000000..d8255d4b9 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/dfor.hpp @@ -0,0 +1,48 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_DFOR_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_DFOR_HPP + +namespace migraphx { + +// Multidimensional for loop +inline constexpr auto dfor() +{ + return [](auto f) { f(); }; +} + +template +constexpr auto dfor(T x, Ts... xs) +{ + return [=](auto f) { + for(T i = 0; i < x; i++) + { + dfor(xs...)([&](Ts... is) { f(i, is...); }); + } + }; +} + +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp new file mode 100644 index 000000000..5ae4c6866 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp @@ -0,0 +1,101 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_DPP_HPP +#define MIGRAPHX_GUARD_KERNELS_DPP_HPP + +#include +#include +#include + +namespace migraphx { + +constexpr bool is_power_of_2(unsigned int x) { return x > 0 and (x & (x - 1)) == 0u; } + +#ifndef MIGRAPHX_HAS_DPP +#define MIGRAPHX_HAS_DPP 1 +#endif + +#if MIGRAPHX_HAS_DPP +constexpr unsigned int dpp_row_shr(unsigned int x) { return 0x110u | x; } + +constexpr unsigned int dpp_row_bcast(unsigned int x) +{ + unsigned int y = 0; + switch(x) + { + case 15: y = 0x142; break; + case 31: y = 0x143; break; + default: MIGRAPHX_UNREACHABLE(); + } + return y; +} + +template +__device__ T dpp_op(T& x, F f) +{ + static const index_int n = sizeof(T) < 4 ? 1 : sizeof(T) / 4; + union type + { + uint32_t reg[n]; + T data; + }; + type output{}; + type input{}; + // cppcheck-suppress unreadVariable + input.data = x; + for(index_int i = 0; i < n; i++) + { + output.reg[i] = f(input.reg[i]); + } + return output.data; +} + +template +__device__ T dpp_mov(T& x) +{ + return dpp_op(x, + [](auto i) { return __hip_move_dpp(i, DppCtrl, RowMask, BankMask, BoundCtrl); }); +} + +template +__device__ T dpp_swizzle(T& x) +{ + return dpp_op(x, [](auto i) { return __hip_ds_swizzle(i, Mask); }); +} + +template +__device__ T readlane(T& x) +{ + static_assert(is_power_of_2(Width), "Width must be a power of 2"); + return dpp_op(x, [](auto i) { return __shfl(i, SrcLane, Width); }); +} + +#endif // MIGRAPHX_HAS_DPP + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_DPP_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/float8.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/float8.hpp new file mode 100644 index 000000000..8227ae220 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/float8.hpp @@ -0,0 +1,567 @@ +/* ************************************************************************ + * + * The MIT License (MIT) + * + * Copyright (C) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- + * ies of the Software, and to permit persons to whom the Software is furnished + * to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- + * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- + * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ************************************************************************ */ + +#ifndef MIGRAPHX_GUARD_KERNELS_FLOAT8_HPP +#define MIGRAPHX_GUARD_KERNELS_FLOAT8_HPP +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" +#pragma clang diagnostic ignored "-Wc++20-extensions" // required for "asm" inside constexpr +#endif // __clang__ + +// We are clipping in down conversion by default +#define MIGRAPHX_F8_DOWNCAST_CLIPPING 1 // NOLINT + +#include +#include +#include + +namespace migraphx { +namespace fp8 { + +enum class rounding_mode +{ + standard, // standard rounding is doing RNE -- round to nearest even + stochastic +}; + +enum class f8_type +{ + bf8 = 0, // s1e5m2 + fp8 = 1 // s1e4m3 +}; + +template +class numeric_limits; + +template +struct float8 +{ + uint8_t data; + // default constructor + __device__ constexpr float8() = default; + // default copy constructor + __device__ constexpr float8(const float8& y) = default; + struct from_bits_t + { + }; + static constexpr __device__ from_bits_t from_bits() { return from_bits_t(); } + + __device__ explicit constexpr float8(uint8_t bits, from_bits_t) : data(bits) {} + +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) + // device specific optimized F8 down-conversion code + + template + static __device__ uint8_t cast_to_f8fnuz_from_f32(float v, uint32_t rng = 0) + { + uint8_t i8data = 0x00; + union + { + float fval; + uint32_t i32val; + uint8_t i8val[4]; // NOTE: not endian independent + } val; + + uint32_t ival = 0; + val.fval = v; + +#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING + if constexpr(T == migraphx::fp8::f8_type::fp8) + { + if((val.i32val & 0x7F800000) != 0x7F800000) /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0); + } + else + { + if((val.i32val & 0x7F800000) != 0x7F800000) // propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 57344.0, -57344.0); + } +#endif + if(stochastic_rounding) + { + if constexpr(T == migraphx::fp8::f8_type::fp8) + { + ival = __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0); // 0 pos + } + else + { + ival = __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos + } + } + else // RNE CVT + { + if constexpr(T == migraphx::fp8::f8_type::fp8) + { + ival = __builtin_amdgcn_cvt_pk_fp8_f32( + val.fval, val.fval, ival, false); // false -> WORD0 + } + else + { + ival = 
__builtin_amdgcn_cvt_pk_bf8_f32( + val.fval, val.fval, ival, false); // false -> WORD0} + } + } + val.i32val = ival; + i8data = val.i8val[0]; // little endian + + return i8data; + } +#endif // __gfx940__ + + // constructor from float +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) + + // NOTE: ON-DEVICE... always optimal bias + explicit constexpr __device__ + float8(const float v, + migraphx::fp8::rounding_mode rm = migraphx::fp8::rounding_mode::standard, + uint32_t rng = 0) + { + if(__builtin_is_constant_evaluated() or !FNUZ) + { + if constexpr(T == migraphx::fp8::f8_type::fp8) + { +#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING + data = migraphx::fp8::impl:: + cast_to_f8<3, 4, float, FNUZ /*negative_zero_nan*/, true /*clip*/>( + v, (rm == migraphx::fp8::rounding_mode::stochastic), rng); +#else // MIGRAPHX_F8_DOWNCAST_CLIPPING + data = migraphx::fp8::impl:: + cast_to_f8<3, 4, float, FNUZ /*negative_zero_nan*/, false /*clip*/>( + v, (rm == migraphx::fp8::rounding_mode::stochastic), rng); +#endif // MIGRAPHX_F8_DOWNCAST_CLIPPING + } + else + { +#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING + data = migraphx::fp8::impl:: + cast_to_f8<2, 5, float, FNUZ /*negative_zero_nan*/, true /*clip*/>( + v, (rm == migraphx::fp8::rounding_mode::stochastic), rng); +#else // MIGRAPHX_F8_DOWNCAST_CLIPPING + data = migraphx::fp8::impl:: + cast_to_f8<2, 5, float, FNUZ /*negative_zero_nan*/, false /*clip*/>( + v, (rm == migraphx::fp8::rounding_mode::stochastic), rng); +#endif // MIGRAPHX_FP8_DOWNCAST_CLIPPING} + } + } + else + { + // runtime branch, use cast_to_f8fnuz_from_f32 if want to avoid it + if(rm == migraphx::fp8::rounding_mode::stochastic) + data = cast_to_f8fnuz_from_f32(v, rng); + else + data = cast_to_f8fnuz_from_f32(v); + } + } +#else + // DEVICE for non-gfx940 using s/w simulation + explicit constexpr __device__ + float8(const float v, + migraphx::fp8::rounding_mode rm = migraphx::fp8::rounding_mode::standard, + uint32_t rng = 0) + { + if constexpr(T == migraphx::fp8::f8_type::fp8) + { +#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING + data = migraphx::fp8::impl:: + cast_to_f8<3, 4, float, FNUZ /*negative_zero_nan*/, true /*clip*/>( + v, (rm == migraphx::fp8::rounding_mode::stochastic), rng); +#else // MIGRAPHX_F8_DOWNCAST_CLIPPING + data = migraphx::fp8::impl:: + cast_to_f8<3, 4, float, FNUZ /*negative_zero_nan*/, false /*clip*/>( + v, (rm == migraphx::fp8::rounding_mode::stochastic), rng); +#endif // MIGRAPHX_F8_DOWNCAST_CLIPPING + } + else + { +#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING + data = migraphx::fp8::impl:: + cast_to_f8<2, 5, float, FNUZ /*negative_zero_nan*/, true /*clip*/>( + v, (rm == migraphx::fp8::rounding_mode::stochastic), rng); +#else // MIGRAPHX_F8_DOWNCAST_CLIPPING + data = migraphx::fp8::impl:: + cast_to_f8<2, 5, float, FNUZ /*negative_zero_nan*/, false /*clip*/>( + v, (rm == migraphx::fp8::rounding_mode::stochastic), rng); +#endif // MIGRAPHX_FP8_DOWNCAST_CLIPPING} + } + } +#endif // __gfx940___ + + // Constructor from half + explicit constexpr __device__ + float8(const _Float16 v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0) + : float8(static_cast(v), rm, rng) + { + } + + // constructor from int + explicit constexpr __device__ + float8(const int v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0) + : float8(static_cast(v), rm, rng) + { + } + + // constructor from uint + explicit constexpr __device__ + float8(const uint32_t v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0) + : float8(static_cast(v), rm, rng) + { + } + + // constructor from 
double + explicit constexpr __device__ + float8(const double v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0) + : float8(static_cast(v), rm, rng) + { + } + + // constructor from bool + explicit constexpr __device__ + float8(const bool v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0) + : float8(static_cast(v), rm, rng) + { + } + // convert to float +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // NOLINT + // upcast using device specific intrinsic + inline constexpr __device__ operator float() const + { + if(__builtin_is_constant_evaluated() or !FNUZ) + { + if constexpr(T == migraphx::fp8::f8_type::fp8) + { + return migraphx::fp8::impl::cast_from_f8<3, 4, float, FNUZ /*negative_zero_nan*/>( + data); + } // else + return migraphx::fp8::impl::cast_from_f8<2, 5, float, FNUZ /*negative_zero_nan*/>(data); + } + else + { + float fval = 0; + uint32_t i32val = static_cast(data); + + // upcast + if constexpr(T == migraphx::fp8::f8_type::fp8) + { + __asm__ volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val)); + } + else + { + __asm__ volatile("v_cvt_f32_bf8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val)); + } + + return fval; + } + } + +#else // non gfx940 + inline constexpr __device__ operator float() const + { + if constexpr(T == migraphx::fp8::f8_type::fp8) + { + return migraphx::fp8::impl::cast_from_f8<3, 4, float, FNUZ /*negative_zero_nan*/>(data); + } // else + return migraphx::fp8::impl::cast_from_f8<2, 5, float, FNUZ /*negative_zero_nan*/>(data); + } +#endif + + inline constexpr explicit __device__ operator bool() const { return not is_zero(); } + + // check for zero + inline __device__ constexpr bool is_zero() const + { + if constexpr(FNUZ) + { + return data == 0x00; + } + else + { + return (data == 0x00) or (data == 0x80); + } + } + + // check for nan + inline __device__ constexpr bool is_nan() const + { + if constexpr(FNUZ) + { + return data == 0x80; + } + else + { + if(T == migraphx::fp8::f8_type::bf8) + { + return (data == 0x7D) or (data == 0x7E) or (data == 0x7F) or (data == 0xFD) or + (data == 0xFE) or (data == 0xFF); + } + else + { + return (data == 0x7F) or (data == 0xFF); + } + } + } + + // check for inf + inline __device__ constexpr bool is_inf() const + { + if constexpr(FNUZ) + { + return data == 0x80; + } + else + { + if(T == migraphx::fp8::f8_type::bf8) + { + return (data == 0x7C) or (data == 0xFC); + } + else + { + // no infinities in e4m3fn, represent them as NaNs + return (data == 0x7F) or (data == 0xFF); + } + } + } + +// NOLINTNEXTLINE +#define MIGRAPHX_FP8_SHORT_UNARY_OP(unary_op, binary_op) \ + constexpr float8& __device__ operator unary_op(const float8& rhs) \ + { \ + const auto tmp = static_cast(*this) binary_op static_cast(rhs); \ + *this = static_cast(tmp); \ + return *this; \ + } \ + constexpr float8& __device__ operator unary_op(const float& rhs) \ + { \ + const auto tmp = static_cast(*this) binary_op static_cast(rhs); \ + *this = static_cast(tmp); \ + return *this; \ + } + + MIGRAPHX_FP8_SHORT_UNARY_OP(*=, *) + MIGRAPHX_FP8_SHORT_UNARY_OP(-=, -) + MIGRAPHX_FP8_SHORT_UNARY_OP(+=, +) + MIGRAPHX_FP8_SHORT_UNARY_OP(/=, /) + + inline __device__ constexpr float8& operator=(const float8& rhs) = default; + inline __device__ constexpr float8& operator=(float8&& rhs) noexcept = default; + + inline __device__ constexpr bool operator<(const float8& rhs) const + { + const auto we = static_cast(*this); + const auto them = static_cast(rhs); + return we < them; + } + + inline __device__ constexpr 
bool operator>(const float8& rhs) const + { + const auto we = static_cast(*this); + const auto them = static_cast(rhs); + return we > them; + } +}; + +// https://onnx.ai/onnx/technical/float8.html +using fp8e4m3fn = float8; +using fp8e5m2 = float8; +using fp8e4m3fnuz = float8; +using fp8e5m2fnuz = float8; + +// NOLINTNEXTLINE +#define MIGRAPHX_FP8_BINARY_OP(binary_op, T, U) \ + inline constexpr U __device__ operator binary_op(const T& lhs, const T& rhs) \ + { \ + return U(static_cast(lhs) binary_op static_cast(rhs)); \ + } + +// NOLINTNEXTLINE +#define MIGRAPHX_FP8_OTHER_OPS(T) \ + inline constexpr __device__ T fabs(T v) \ + { \ + /*NOLINTNEXTLINE*/ \ + v.data = v.data & 0x7f; \ + return v; \ + } \ + inline __device__ constexpr bool operator==(const T& lhs, const T& rhs) \ + { \ + if(rhs.is_nan() or rhs.is_inf() or lhs.is_nan() or lhs.is_inf()) \ + return false; \ + else if((rhs.is_zero() and lhs.is_zero()) or (lhs.data == rhs.data)) \ + return true; \ + return false; \ + } + +// NOLINTNEXTLINE +#define MIGRAPHX_FP8_GEN_OP_OVERLOADS(T) \ + MIGRAPHX_FP8_BINARY_OP(*, T, T) \ + MIGRAPHX_FP8_BINARY_OP(-, T, T) \ + MIGRAPHX_FP8_BINARY_OP(/, T, T) \ + MIGRAPHX_FP8_BINARY_OP(+, T, T) \ + MIGRAPHX_FP8_BINARY_OP(>=, T, bool) \ + MIGRAPHX_FP8_BINARY_OP(<=, T, bool) \ + MIGRAPHX_FP8_BINARY_OP(!=, T, bool) \ + MIGRAPHX_FP8_OTHER_OPS(T) + +MIGRAPHX_FP8_GEN_OP_OVERLOADS(fp8e5m2) +MIGRAPHX_FP8_GEN_OP_OVERLOADS(fp8e5m2fnuz) +MIGRAPHX_FP8_GEN_OP_OVERLOADS(fp8e4m3fn) +MIGRAPHX_FP8_GEN_OP_OVERLOADS(fp8e4m3fnuz) + +template <> +class numeric_limits +{ + public: + static constexpr bool has_infinity = false; + static constexpr __device__ fp8e4m3fnuz epsilon() + { + return fp8e4m3fnuz(0x28, fp8e4m3fnuz::from_bits()); + } + // NOLINTNEXTLINE + static constexpr __device__ fp8e4m3fnuz quiet_NaN() + { + return fp8e4m3fnuz(0x80, fp8e4m3fnuz::from_bits()); + } + + static constexpr __device__ fp8e4m3fnuz max() + { + return fp8e4m3fnuz(0x7F, fp8e4m3fnuz::from_bits()); + } + // this is min value that is not DeNormalized(DeNorm). DeNorm min is 0x01 + static constexpr __device__ fp8e4m3fnuz min() + { + return fp8e4m3fnuz(0x08, fp8e4m3fnuz::from_bits()); + } + + static constexpr __device__ fp8e4m3fnuz lowest() + { + return fp8e4m3fnuz(0xFF, fp8e4m3fnuz::from_bits()); + } +}; + +template <> +class numeric_limits +{ + public: + static constexpr bool has_infinity = false; + static constexpr __device__ fp8e4m3fn epsilon() + { + return fp8e4m3fn(0x20, fp8e4m3fn::from_bits()); + } + // NOLINTNEXTLINE + static constexpr __device__ fp8e4m3fn quiet_NaN() + { + return fp8e4m3fn(0x7F, fp8e4m3fn::from_bits()); + } + + static constexpr __device__ fp8e4m3fn max() { return fp8e4m3fn(0x7E, fp8e4m3fn::from_bits()); } + // this is min value that is not DeNormalized(DeNorm). DeNorm min is 0x01 + static constexpr __device__ fp8e4m3fn min() { return fp8e4m3fn(0x08, fp8e4m3fn::from_bits()); } + + static constexpr __device__ fp8e4m3fn lowest() + { + return fp8e4m3fn(0xFE, fp8e4m3fn::from_bits()); + } +}; + +template <> +class numeric_limits +{ + public: + static constexpr bool has_infinity = false; + static constexpr __device__ fp8e5m2fnuz epsilon() + { + return fp8e5m2fnuz(0x34, fp8e5m2fnuz::from_bits()); + } + + static constexpr __device__ fp8e5m2fnuz quiet_NaN() // NOLINT + { + return fp8e5m2fnuz(0x80, fp8e5m2fnuz::from_bits()); + } + + static constexpr __device__ fp8e5m2fnuz max() + { + return fp8e5m2fnuz(0x7F, fp8e5m2fnuz::from_bits()); + } + // this is min value that is not DeNormalized(DeNorm). DeNorm min is 0x01. 
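+    // Worked decoding, assuming the usual e5m2fnuz layout (1 sign, 5 exponent and 2 mantissa
+    // bits with an FNUZ exponent bias of 16): 0x04 is 0b0'00001'00, i.e. exponent field 1 and
+    // zero mantissa, giving 2^(1-16) = 2^-15 (~3.05e-5) as the smallest normal value, while
+    // the denormal minimum 0x01 would decode to 2^-15 * 2^-2 = 2^-17.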
+ static constexpr __device__ fp8e5m2fnuz min() + { + return fp8e5m2fnuz(0x4, fp8e5m2fnuz::from_bits()); + } + + static constexpr __device__ fp8e5m2fnuz lowest() + { + return fp8e5m2fnuz(0xFF, fp8e5m2fnuz::from_bits()); + } +}; + +template <> +class numeric_limits +{ + public: + static constexpr bool has_infinity = true; + static constexpr __device__ fp8e5m2 epsilon() { return fp8e5m2(0x34, fp8e5m2::from_bits()); } + // 7D, 7E, 7F are positive NaNs and FD, FE, FF are negative NaNs + static constexpr __device__ fp8e5m2 quiet_NaN() // NOLINT + { + return fp8e5m2(0xFF, fp8e5m2::from_bits()); + } + + static constexpr __device__ fp8e5m2 max() { return fp8e5m2(0x7B, fp8e5m2::from_bits()); } + // this is min value that is not DeNormalized(DeNorm). DeNorm min is 0x01. + static constexpr __device__ fp8e5m2 min() { return fp8e5m2(0x4, fp8e5m2::from_bits()); } + + static constexpr __device__ fp8e5m2 lowest() { return fp8e5m2(0xFB, fp8e5m2::from_bits()); } + // 7C and FC both are infinity + static constexpr __device__ fp8e5m2 infinity() { return fp8e5m2(0x7C, fp8e5m2::from_bits()); } +}; + +} // namespace fp8 +template {} or is_same{} or + is_same{} or is_same{})> +constexpr T numeric_max(migraphx::fp8::f8_type unused = migraphx::fp8::f8_type::fp8) +{ + // unused parameter is added to make this numeric_max different overload definition + // compared to numeric_max defined in type_traits.hpp + (void)(unused); + return fp8::numeric_limits::max(); +} +template {} or is_same{} or + is_same{} or is_same{})> +constexpr T numeric_lowest(migraphx::fp8::f8_type unused = migraphx::fp8::f8_type::fp8) +{ + // unused parameter is added to make this numeric_lowest different overload definition + // compared to numeric_lowest defined in type_traits.hpp + (void)(unused); + return fp8::numeric_limits::lowest(); +} +} // namespace migraphx +// ================================================================================================= +#if defined(__clang__) +#pragma clang diagnostic pop +#endif // __clang__ + +#endif // MIGRAPHX_GUARD_KERNELS_FLOAT8_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp new file mode 100644 index 000000000..2eca5ed4a --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp @@ -0,0 +1,331 @@ +/* ************************************************************************ + * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- + * ies of the Software, and to permit persons to whom the Software is furnished + * to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- + * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- + * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ************************************************************************ */ + +#ifndef MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP +#define MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP +#include +#include +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-identifier" +#endif + +namespace migraphx { + +namespace fp8 { +namespace impl { + +// NOLINTBEGIN +template +__device__ constexpr uint8_t cast_to_f8(T f_x, bool stoch = false, uint32_t rng = 0) +{ + constexpr bool is_float = true; + // half is not supported for now + constexpr bool is_half = false; + static_assert(Wm + We == 7, "Wm+We==7"); + static_assert(is_float or is_half, "Only float can be cast to f8"); + + const uint32_t mfmt = (sizeof(T) == 4) ? 23 : 10; + typename migraphx::conditional_t x; + + if constexpr(sizeof(T) == 4) + x = migraphx::bit_cast(f_x); + else + x = migraphx::bit_cast(f_x); + + uint32_t head = 0; + uint32_t mantissa = 0; + int exponent = 0; + uint32_t bias = 0; + uint32_t sign = 0; + if constexpr(sizeof(T) == 4) + { + head = x & 0xFF800000; + mantissa = x & 0x7FFFFF; + exponent = (head >> 23) & 0xFF; + sign = head >> 31; + bias = 127; + } + else + { + head = x & 0xFC00; + mantissa = x & 0x3FF; + exponent = (head >> 10) & 0x1F; + sign = head >> 15; + bias = 15; + } + + uint32_t signed_inf = (sign << 7) + (((1 << We) - 1) << Wm); + uint32_t signed_all_ones = (sign << 7) + ((((1 << We) - 1) << Wm) + ((1 << Wm) - 1)); + + // Calcualte maximum singed value FLT_MAX, FLT_MIN + uint32_t signed_max = signed_all_ones; + if(not NegativeZeroNan) + signed_max = (Wm == 2) ? (signed_max - 4) : (signed_max - 1); + + // Deal with inf and NaNs + if(NegativeZeroNan) // For the FNUZ cases, it is simple just return NaNs + { + if((sizeof(T) == 4 and ((x & 0x7F800000) == 0x7F800000)) or + (sizeof(T) == 2 and ((x & 0x7C00) == 0x7C00))) + return 0x80; + } + else + { + // calculate most common NaN mantissa for FP8, which is all Ones in binary + uint32_t nan_mantissa = 1; + for(auto i = 1; i < Wm; ++i) + { + nan_mantissa |= (nan_mantissa << 1); + } + if((sizeof(T) == 4 and ((x & 0x7F800000) == 0x7F800000)) or + (sizeof(T) == 2 and ((x & 0x7C00) == 0x7C00))) + { + // infinity + if(mantissa == 0) + { + if(sign == 0) + return (Wm == 2) ? 0x7B : 0x7E; + else + return (Wm == 2) ? 0xFB : 0xFE; + } + else // NaNs + return signed_inf + nan_mantissa; + } + } + // handle positive zero + if(x == 0) + return 0; + // handle negative zero + else if((sizeof(T) == 4 and x == 0x80000000) or (sizeof(T) == 2 and x == 0x8000)) + { + return NegativeZeroNan ? 0 : 0x80; // For FNUZ types neg zero is just positive zero + } + + /* First need to check if it is normal or denorm as there is a difference of implict 1 + Then need to adjust the exponent to align with the F8 exponent, in the meanwhile, shift + The mantissa. Then for stochastic rounding, add rng to mantissa and truncate. And for + RNE, no need to add rng. Then probably need to check whether there is carry and adjust + exponent and mantissa again*/ + + // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent bits + const int f8_bias = (1 << (We - 1u)) - 1 + (NegativeZeroNan ? 
1 : 0); + const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal + /* act_exponent is the actual exponent of fp32/fp16 (after subtracting bias) + f8_exponent is the converted f8 exponent with bias encoding + exponent_diff is the diff between fp32/fp16 exponent and f8 exponent, + the difference needs to be adjusted and mantissa shifted*/ + int act_exponent = 0; + int f8_exponent = 0; + int exponent_diff = 0; + + if(exponent == 0 and mantissa != 0) + { // fp32/fp16 is in denormal. + /* fp32 denormal is below 2^-127 so it is usually not a concern here, we mostly concern fp16 + here. In this case, f8 is usually in denormal. But there could be exceptions. fp16 denormal + has exponent bias 15 while bf8 with FNUZ has exponent bias 16. It means that there are some + numbers in fp16 denormal but they are bf8 (FNUZ) normals - smallest bf8 (FNUZ) normal is + 2^-15. fp16 numbers where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 + are bf8 (FNUZ) normal. In this case, the fp16 mantissa should be shift left by 1 */ + act_exponent = 1 - bias; + exponent_diff = f8_denormal_act_exponent - + act_exponent; // actual exponent is exponent-bias+1 as it is denormal + } + else + { // fp32/fp16 is normal with implicit 1 + act_exponent = exponent - bias; + if(act_exponent <= f8_denormal_act_exponent) + { + /* This is the case where fp32/fp16 is normal but it is in f8 denormal range. + For example fp8 FNUZ mode, denormal exponent is -7, but if the fp32/fp16 + actual exponent is -7, it is actually larger due to the implict 1, + Therefore it needs to be adjust to -6 and mantissa shift right by 1. + So for fp32/fp16, exponent -8 is the cut point to convert to fp8 FNUZ */ + exponent_diff = f8_denormal_act_exponent - act_exponent; + } + else + { // both fp32/fp16 and f8 are in normal range + exponent_diff = + 0; // exponent_diff=0 does not mean there is no difference for this case, + // act_exponent could be larger. Just that it does not need shift mantissa + } + mantissa += (1 << mfmt); // Add the implicit 1 into mantissa + } + + // need to know whether the number is right in the middle of two adjacent fp8 numbers. use max + // value of 31 to avoid undefined behaviour + bool midpoint = (mantissa & ((1u << (mfmt - Wm + exponent_diff)) - 1)) == + (1u << (mfmt - Wm + exponent_diff - 1)); + /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we + shift right as shift right could rip off some residual part and make something not midpoint look + like midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010), it is larger than + midpoint, but after shift right by 4 bits, it would look like midpoint. + */ + + if(exponent_diff > 0) + mantissa >>= exponent_diff; + else if(exponent_diff == -1) + mantissa <<= -exponent_diff; + bool implicit_one = mantissa & (1 << mfmt); + // if there is no implict 1, it means the f8 is denormal and need to adjust to denorm exponent + f8_exponent = + (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1); + + // Now we have the exponent and mantissa adjusted + uint32_t drop_mask = (1 << (mfmt - Wm)) - 1; + bool odd = + mantissa & (1 << (mfmt - Wm)); // if the least significant bit that is not truncated is 1 + /* + This part is doing rounding by adding mantissa part that is going to get dropped. + e.g. if the dropped part for less than 0.5 than it would round down. 
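+    (Concretely, in the float-to-e4m3 case here, Wm=3 and mfmt=23, so the dropped part is the
+    low 20 mantissa bits selected by drop_mask, and "0.5" corresponds to those bits equalling
+    0x80000.)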
+ if the dropped part is more than 0.5 then it would round up by rolling carry to LSB of retained + mantissa. + For the mid point when bit pattern is like this for Odd: `xy1:10000000` for Odd and + `xy0:10000000` for the Even. where `:` is delimiter for dropped v/s retained part. + For the odd case : + this will add xy1:10000000 + 000:10000000 which would roll over carry to LSB of retained + part making it RNE. + For the even case : this will add xy0:10000000 + 000:01111111 which would + round down and keep number Even + */ + mantissa += (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask; + + // Now we deal with overflow + if(f8_exponent == 0 and ((1 << mfmt) & mantissa)) + { + f8_exponent = 1; // denormal overflow to become normal, promote exponent + } + else if((1 << (mfmt + 1)) & mantissa) + { + mantissa >>= 1; + f8_exponent++; + } + + mantissa >>= (mfmt - Wm); + + // above range: quantize to maximum possible float of the same sign + // for e5m2 case, max_exp is 14, since exp = 15 is reserved for Infs and Nans + const int max_exp = (1 << We) - ((NegativeZeroNan or Wm == 3) ? 1 : 2); + if(f8_exponent > max_exp) + { + if(Clip) + return signed_max; + else + { + // https://onnx.ai/onnx/technical/float8.html#cast + if(NegativeZeroNan) + return 0x80; + else + return (Wm == 2) ? signed_inf : signed_all_ones; + } + } + + if(f8_exponent == 0 and mantissa == 0) + return NegativeZeroNan ? 0 : (sign << 7); + mantissa &= (1 << Wm) - 1; + return (sign << 7) | (f8_exponent << Wm) | mantissa; +} +// NOLINTEND + +template +__device__ constexpr T cast_from_f8(uint8_t x) +{ + // half is not supported for now + constexpr bool is_half = false; + constexpr bool is_float = true; + static_assert(is_float or is_half, "Only float are supported"); + + constexpr int weo = is_half ? 5 : 8; + constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7); + // NOLINTNEXTLINE + T f_inf, f_neg_inf, f_nan, f_neg0; + + if constexpr(is_float) + { + const uint32_t if_inf = 0x7F800000; + const uint32_t if_neg_inf = 0xFF800000; + const uint32_t if_nan = 0x7F800001; + const uint32_t if_neg0 = 0x80000000; + f_inf = migraphx::bit_cast(if_inf); + f_neg_inf = migraphx::bit_cast(if_neg_inf); + f_nan = migraphx::bit_cast(if_nan); + f_neg0 = migraphx::bit_cast(if_neg0); + } + + if(x == 0) + return 0; + + uint32_t sign = x >> 7; // NOLINT + uint32_t mantissa = x & ((1 << Wm) - 1); // NOLINT + int exponent = (x & 0x7F) >> Wm; // NOLINT + if(NegativeZeroNan) + { + if(x == 0x80) + return f_nan; + } + else + { + if(x == 0x80) + return f_neg0; + if(exponent == ((1 << We) - 1) and Wm == 2) // NOLINT + return (mantissa == 0) ? (sign ? f_neg_inf : f_inf) : f_nan; + else if(Wm == 3 and (x == 0x7F or x == 0xFF)) + return f_nan; + } + typename migraphx::conditional_t retval; + + const int exp_low_cutoff = + (1 << (weo - 1)) - (1 << (We - 1)) + 1 - (NegativeZeroNan ? 
1 : 0); // NOLINT + + // subnormal input + if(exponent == 0) + { + // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above + int sh = 1 + __builtin_clz(mantissa) - (32 - Wm); + mantissa <<= sh; // NOLINT + exponent += 1 - sh; + mantissa &= ((1 << Wm) - 1); // NOLINT + } + exponent += exp_low_cutoff - 1; + mantissa <<= wmo - Wm; // NOLINT + + // subnormal output (occurs when T=half, We=5, negative_zero_nan=true) + if(exponent <= 0) + { + mantissa |= 1 << wmo; // NOLINT + mantissa >>= 1 - exponent; // NOLINT + exponent = 0; + } + + if(sizeof(T) == 2) + retval = (sign << 15) | (exponent << 10) | mantissa; // NOLINT + else + retval = (sign << 31) | (exponent << 23) | mantissa; // NOLINT + return migraphx::bit_cast(retval); +} +} // namespace impl +} // namespace fp8 +} // namespace migraphx +#if defined(__clang__) +#pragma clang diagnostic pop +#endif +#endif // MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/functional.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/functional.hpp new file mode 100644 index 000000000..3e9d80261 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/functional.hpp @@ -0,0 +1,389 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP +#define MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP + +#include + +// Similiar to decltype(auto) except it will propagate any substitution failures +// NOLINTNEXTLINE +#define MIGRAPHX_RETURNS(...) \ + ->decltype(__VA_ARGS__) { return __VA_ARGS__; } + +// Lifts an expression into a function object so it can be passed to a higher-order function +// NOLINTNEXTLINE +#define MIGRAPHX_LIFT(...) \ + [](auto&&... private_lifts_xs) MIGRAPHX_RETURNS( \ + (__VA_ARGS__)(static_cast(private_lifts_xs)...)) + +// NOLINTNEXTLINE +#define MIGRAPHX_LIFT_CLASS(name, ...) \ + struct name \ + { \ + template \ + constexpr auto operator()(PrivateLiftTs&&... private_lifts_xs) const MIGRAPHX_RETURNS( \ + (__VA_ARGS__)(static_cast(private_lifts_xs)...)) \ + } + +namespace migraphx { + +struct swallow +{ + template + constexpr swallow(Ts&&...) + { + } +}; + +template +using ignore = swallow; + +template +struct overloaded : Fs... +{ + using Fs::operator()...; + constexpr overloaded(Fs... fs) : Fs(fs)... 
{} +}; + +template +constexpr overloaded overload(Fs... fs) +{ + return {fs...}; +} + +namespace detail { + +template +struct eval_helper +{ + R result; + + template + constexpr eval_helper(const F& f, Ts&&... xs) : result(f(static_cast(xs)...)) + { + } +}; + +template <> +struct eval_helper +{ + int result; + template + constexpr eval_helper(const F& f, Ts&&... xs) : result((f(static_cast(xs)...), 0)) + { + } +}; + +template +struct seq +{ + using type = seq; +}; + +template +struct merge_seq; + +template +struct merge_seq, seq> : seq +{ +}; + +template +struct gens : merge_seq::type, typename gens::type> +{ +}; + +template <> +struct gens<0> : seq<> +{ +}; +template <> +struct gens<1> : seq<0> +{ +}; + +template +constexpr auto sequence_c_impl(F&& f, seq) +{ + return f(index_constant{}...); +} + +template +constexpr auto args_at(seq) +{ + return [](ignore..., auto x, auto...) { return x; }; +} + +} // namespace detail + +template +constexpr auto always(T x) +{ + return [=](auto&&...) { return x; }; +} + +template +constexpr auto sequence_c(F&& f) +{ + return detail::sequence_c_impl(f, detail::gens{}); +} + +template +constexpr auto sequence(IntegerConstant ic, F&& f) +{ + return sequence_c(f); +} + +template +constexpr auto by(F f, G g) +{ + return [=](auto... xs) { + return detail::eval_helper{g, f(xs)...}.result; + }; +} + +template +constexpr auto by(F f) +{ + return by([=](auto x) { return (f(x), 0); }, always(0)); +} + +template +constexpr void each_args(F f, Ts&&... xs) +{ + swallow{(f(static_cast(xs)), 0)...}; +} + +template +constexpr void each_args(F) +{ +} + +template +constexpr void unpack_each(F f) +{ + f(); +} + +template +constexpr void unpack_each(F f, Pack p) +{ + p([&](auto&&... xs) { each_args(f, static_cast(xs)...); }); +} + +template +constexpr void unpack_each(F f, Pack1 p1, Pack2 p2) +{ + p1([&](auto&&... xs) { + p2([&](auto&&... ys) { + each_args( + [&](auto&& p) { p(f); }, + pack_forward(static_cast(xs), static_cast(ys))...); + }); + }); +} + +template +constexpr void unpack_each(F f, Pack1 p1, Pack2 p2, Packs... packs) +{ + unpack_each( + [&](auto&& x, auto&& y) { + unpack_each( + [&](auto&&... zs) { + f(static_cast(x), + static_cast(y), + static_cast(zs)...); + }, + packs...); + }, + p1, + p2); +} + +template +constexpr void repeat_c(F&& f) +{ + sequence_c([&](auto... xs) { each_args(f, xs...); }); +} + +template +constexpr auto repeat(IntegerConstant ic, F&& f) +{ + return repeat_c(f); +} + +template +constexpr auto fold_impl(F&&, T&& x) +{ + return static_cast(x); +} + +template +constexpr auto fold_impl(F&& f, T&& x, U&& y, Ts&&... xs) +{ + return fold_impl(f, f(static_cast(x), static_cast(y)), static_cast(xs)...); +} + +template +constexpr auto fold(F f) +{ + return [=](auto&&... xs) { return fold_impl(f, static_cast(xs)...); }; +} + +template +constexpr auto compose(Fs... fs) +{ + return fold([](auto f, auto g) { + return [=](auto&&... xs) { return f(g(static_cast(xs)...)); }; + })(fs...); +} + +template +constexpr auto partial(F f) +{ + return [=](auto... xs) { + return [=](auto&&... ys) { return f(xs..., static_cast(ys)...); }; + }; +} + +template +constexpr auto pack(Ts... xs) +{ + return [=](auto f) { return f(xs...); }; +} + +template +constexpr auto pack_forward(Ts&&... xs) +{ + return [&](auto f) { return f(static_cast(xs)...); }; +} + +template +constexpr auto join(G g, F f) +{ + return f([=](auto... xs) { return g(xs...); }); +} + +template +constexpr auto join(G g, F f, Fs... 
fs) +{ + // return f1([=](auto x) { return f2([=](auto y) { return g(x, y); }); }); + return f([=](auto... xs) { return join([=](auto... ys) { return g(xs..., ys...); }, fs...); }); +} + +template +constexpr auto pack_compare(Compare compare, P1 p1, P2 p2) +{ + return p1([&](auto... xs) { + return p2([&](auto... ys) { + auto c = [&](auto x, auto y) -> int { + if(compare(x, y)) + return 1; + else if(compare(y, x)) + return -1; + else + return 0; + }; + return fold([](auto x, auto y) { return x ? x : y; })(c(xs, ys)..., 0); + }); + }); +} + +template +constexpr auto arg_c() +{ + return [](auto... xs) { return detail::args_at(detail::gens{})(xs...); }; +} + +template +constexpr auto arg(IntegralConstant ic) +{ + return arg_c(); +} + +template +constexpr auto make_transform(F f) +{ + return [=](auto... xs) { return [=](auto g) { return f(g, xs...); }; }; +} + +// An arg transformation takes the arguments and then a function to take the new arguments: +// transform(xs...)([](auto... ys) { ... }) +// The transform_args function takes a list of transformations and continually applies them +template +constexpr auto transform_args(F f) +{ + return f; +} + +template +constexpr auto transform_args(F f, Fs... fs) +{ + return make_transform([=](auto g, auto... xs) { + return f(xs...)([=](auto... ys) { return transform_args(fs...)(ys...)(g); }); + }); +} + +// identity transform +inline constexpr auto transform_args() +{ + return make_transform([](auto f, auto... xs) { return f(xs...); }); +} + +// Rotate the last N arguments to the first N arguments +template +constexpr auto rotate_last() +{ + return make_transform([](auto f, auto... xs) { + return sequence_c([&](auto... is) { + constexpr auto size = sizeof...(is); + return f(arg_c<(is + size - N) % size>()(xs...)...); + }); + }); +} + +inline constexpr auto rotate_last() { return rotate_last<1>(); } + +// Pack the first N arguments +template +constexpr auto pack_first() +{ + return make_transform([](auto f, auto... xs) { + return sequence_c([&](auto... is) { + return sequence_c([&](auto... js) { + return f(pack(arg_c()(xs...)...), arg_c()(xs...)...); + }); + }); + }); +} + +// Rotate the last N arguments as the first argument packed +template +constexpr auto rotate_and_pack_last() +{ + return transform_args(rotate_last(), pack_first()); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gather.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gather.hpp new file mode 100644 index 000000000..45f4ffcde --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gather.hpp @@ -0,0 +1,64 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_GATHER_HPP +#define MIGRAPHX_GUARD_KERNELS_GATHER_HPP + +#include +#include +#include +#include + +namespace migraphx { + +template +constexpr auto gather_shape(Input input, Indices indices) +{ + auto lengths = input.lens; + + lengths[Axis] = indices.elements(); + return make_shape(lengths, input.strides); +} + +template +__device__ void gather(Input input, Indices indices, Output output) +{ + auto ind = make_index(); + auto axis_dim_size = input.get_shape().lens[Axis]; + + constexpr auto out_comp = gather_shape(get_shape_c{}, get_shape_c{}); + + ind.global_stride(output.get_shape().elements(), [&](auto i) { + auto idx = out_comp.multi(i); + auto in_index = indices[idx[Axis]]; + + auto new_in_index = (in_index < 0) ? in_index + axis_dim_size : in_index; + + idx[Axis] = new_in_index; + + output[i] = input[idx]; + }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp new file mode 100644 index 000000000..325b7d34f --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp @@ -0,0 +1,98 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_GATHERND_HPP +#define MIGRAPHX_GUARD_KERNELS_GATHERND_HPP + +#include +#include +#include +namespace migraphx { + +template +struct gathernd_settings +{ + T batch_dims{}; +}; + +template +constexpr gathernd_settings make_gathernd_settings(Ts... 
xs) +{ + return {xs...}; +} + +template +__device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t, Settings s) +{ + auto ind = make_index(); + auto batch_dims = s.batch_dims; + auto output_shape = output_t.get_shape(); + auto indices_shape = indices_t.get_shape(); + auto data_shape = data_t.get_shape(); + + auto indices_shape_lens = indices_shape.lens; + auto data_shape_lens = data_shape.lens; + auto num_slice_dims = indices_shape_lens.back(); + size_t num_slices = + accumulate(indices_shape_lens.begin(), indices_shape_lens.end() - 1, 1, op::product{}); + size_t slice_size = accumulate(data_shape_lens.begin() + num_slice_dims + batch_dims, + data_shape_lens.end(), + 1, + op::product{}); + const size_t num_batches = + accumulate(data_shape_lens.begin(), data_shape_lens.begin() + batch_dims, 1, op::product{}); + const size_t data_batch_stride = + accumulate(data_shape_lens.begin() + batch_dims, data_shape_lens.end(), 1, op::product{}); + const auto num_slices_per_batch = num_slices / num_batches; + + ind.global_stride(output_shape.elements(), [&](auto i) { + const auto* indices_ptr = indices_t.data(); + const size_t j = i / slice_size; + const size_t batch_idx = j / num_slices_per_batch; + + auto* slice_indices = indices_ptr + (j * num_slice_dims); + size_t relative_slice_offset = 0; + for(size_t idx = 0; idx < num_slice_dims; ++idx) + { + int64_t index = slice_indices[idx]; + const size_t input_dim_idx = batch_dims + idx; + const auto input_dim = data_shape_lens[input_dim_idx]; + MIGRAPHX_ASSERT(index >= -static_cast(input_dim) and + index < static_cast(input_dim)); + if(index < 0) + index += input_dim; + size_t size_from_slice_dims = + accumulate(data_shape_lens.begin() + batch_dims + idx + 1, + data_shape_lens.begin() + batch_dims + num_slice_dims, + slice_size, + op::product{}); + relative_slice_offset += index * size_from_slice_dims; + } + + auto slice_offset = (batch_idx * data_batch_stride) + relative_slice_offset; + output_t[i] = data_t[slice_offset + i % slice_size]; + }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gemm_batcher.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gemm_batcher.hpp new file mode 100644 index 000000000..d219786c6 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gemm_batcher.hpp @@ -0,0 +1,92 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_GEMM_BATCHER_HPP +#define MIGRAPHX_GUARD_KERNELS_GEMM_BATCHER_HPP + +#include +#include +#include + +namespace migraphx { + +template +constexpr auto gemm_get_batches() +{ + constexpr auto lens = get_shape_c{}.lens; + constexpr auto strides = get_shape_c{}.strides; + constexpr auto new_lens = sequence( + lens.size() - _c<2>, [&](auto... is) { return make_const_array(_c...); }); + constexpr auto new_strides = sequence( + strides.size() - _c<2>, [&](auto... is) { return make_const_array(_c...); }); + return make_shape(new_lens, new_strides); +} + +template +constexpr auto gemm_get_matrix() +{ + constexpr auto lens = get_shape_c{}.lens; + constexpr auto strides = get_shape_c{}.strides; + constexpr auto m = lens.size() - _c<2>; + constexpr auto n = lens.size() - _c<1>; + constexpr auto new_lens = make_const_array(_c, _c); + constexpr auto new_strides = make_const_array(_c, _c); + return make_shape(new_lens, new_strides); +} + +template +constexpr auto gemm_batch_slice(Tensor t, T i) +{ + constexpr auto batch = gemm_get_batches(); + constexpr auto matrix = gemm_get_matrix(); + MIGRAPHX_ASSERT((batch.index(i) + matrix.element_space()) <= t.get_shape().element_space()); + return make_tensor_view(t.data() + batch.index(i), matrix); +} + +template +constexpr auto gemm_batch_args(index idx, BlocksPerBatch bpb, T x, Ts... xs) +{ + return [=](auto f) { + // All tensors should have the same rank + static_assert( + (true and ... and (get_shape_c{}.lens.size() == get_shape_c{}.lens.size()))); + if constexpr(get_shape_c{}.lens.size() > 2) + { + // Get the first batch since all batches should have the same number of elements + constexpr auto batch = gemm_get_batches(); + static_assert( + (true and ... and (batch.elements() == gemm_get_batches().elements()))); + idx.group_stride(bpb * batch.elements(), [&](auto gidx) { + const auto batch_idx = gidx / bpb; + f(gemm_batch_slice(x, batch_idx), gemm_batch_slice(xs, batch_idx)...); + }); + } + else + { + f(x, xs...); + } + }; +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_GEMM_BATCHER_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/generic_constant.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/generic_constant.hpp new file mode 100644 index 000000000..a1c2c9f82 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/generic_constant.hpp @@ -0,0 +1,56 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_GENERIC_CONSTANT_HPP +#define MIGRAPHX_GUARD_KERNELS_GENERIC_CONSTANT_HPP + +namespace migraphx { + +template +struct generic_constant +{ + static constexpr auto value = F{}(); + using value_type = decltype(value); + using type = generic_constant; + constexpr operator value_type() const noexcept { return value; } + constexpr value_type operator()() const noexcept { return value; } +}; + +template +constexpr generic_constant make_generic_constant(F) +{ + return {}; +} + +// NOLINTNEXTLINE +#define MIGRAPHX_MAKE_CONSTANT(x) \ + make_generic_constant([] { \ + struct fun \ + { \ + constexpr auto operator()() const { return x; } \ + }; \ + return fun{}; \ + }()) + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_GENERIC_CONSTANT_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gqa_rotary_embedding.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gqa_rotary_embedding.hpp new file mode 100644 index 000000000..6fd1b15f3 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gqa_rotary_embedding.hpp @@ -0,0 +1,178 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_ROTARY_EMBEDDING_HPP +#define MIGRAPHX_GUARD_KERNELS_ROTARY_EMBEDDING_HPP + +#include +#include +#include + +namespace migraphx { + +template +__device__ void run_rotary_embedding(Input input, + CosCache cos_cache, + SinCache sin_cache, + Output output, + PosIDs pos_ids, + Params params, + index_int idx, + bool is_query = false) +{ + const index_int batch_size = params.batch_size; + const index_int sequence_length = params.sequence_length; + const index_int n_heads = is_query ? 
params.num_heads : params.kv_num_heads; + const index_int head_size = params.head_size; + const index_int head_stride = params.head_stride; + const index_int seq_stride = params.seq_stride; + const index_int batch_stride = params.batch_stride; + const int position_ids_format = params.position_ids_format; + const index_int rotary_emb_dim = params.rotary_embedding_dim; + const index_int half_rotary_emb_dim = rotary_emb_dim / 2; + + const index_int loop_len = batch_size * sequence_length * n_heads; + const index_int i = idx / head_size; + const index_int ii = idx % head_size; + if(i < loop_len) + { + const index_int b = (i / n_heads) / sequence_length; + const index_int s = (i / n_heads) % sequence_length; + const index_int n = i % n_heads; + const index_int block_offset = b * batch_stride + s * seq_stride + n * head_stride; + auto input_data = input + block_offset; + auto output_data = output + block_offset; + + // Cache is (M, H/2) or (M, rotary_embedding_dim/2) + int position_id = (position_ids_format == 0) + ? static_cast(pos_ids[0]) + s + : static_cast(pos_ids[b * sequence_length + s]); + position_id = (sequence_length == 1) ? position_id : s; + + const index_int cache_offset = position_id * half_rotary_emb_dim; + auto cos_data = cos_cache + cache_offset; + auto sin_data = sin_cache + cache_offset; + + int cache_idx = 0; + double sign = 0.0; + int j = 0; + if(ii < rotary_emb_dim) + { + if(params.rotary_interleaved) + { + cache_idx = (ii / 2) % half_rotary_emb_dim; + sign = (ii % 2 == 0) ? -1.0 : 1.0; + j = (ii % 2 == 0) ? ii + 1 : ii - 1; // i - sign + } + else + { + cache_idx = ii % half_rotary_emb_dim; + sign = (ii < half_rotary_emb_dim) ? -1.0 : 1.0; + j = (ii + half_rotary_emb_dim) % rotary_emb_dim; + } + double out_data = + static_cast(input_data[ii]) * static_cast(cos_data[cache_idx]) + + sign * static_cast(input_data[j]) * + static_cast(sin_data[cache_idx]); + output_data[ii] = out_data; + } + else if(ii < head_size) + { + output_data[ii] = input_data[ii]; + } + } +} + +template +__device__ void +pack_v_into_rotary_qkv(Params params, const Input input, Output output, index_int idx) +{ + const index_int loop_len = params.batch_size * params.sequence_length * params.kv_num_heads; + auto i = idx / params.head_size; + auto ii = idx % params.head_size; + if(i < loop_len) + { + const index_int b = (i / params.kv_num_heads) / params.sequence_length; + const index_int s = (i / params.kv_num_heads) % params.sequence_length; + const index_int n = i % params.kv_num_heads; + const index_int block_offset = + b * params.batch_stride + s * params.seq_stride + n * params.head_stride; + const Input input_data = input + block_offset; + Output output_data = output + block_offset; + if(ii < params.head_size) + { + output_data[ii] = input_data[ii]; + } + } +} + +template +__device__ void gqa_rotary_embedding(Output output, + Query query, + SeqLensK seqlens_k, + CosCache cos_cache, + SinCache sin_cache, + Params params) +{ + auto ind = make_index(); + ind.global_stride(output.get_shape().elements(), [&](auto idx) { + auto q_input = query.begin(); + auto q_rotary = output.begin(); + auto k_input = q_input + params.num_heads * params.sequence_length * params.head_size; + auto k_rotary = q_rotary + params.num_heads * params.sequence_length * params.head_size; + auto v_input = k_input + params.kv_num_heads * params.sequence_length * params.head_size; + auto v_rotary = k_rotary + params.kv_num_heads * params.sequence_length * params.head_size; + auto q_chunk_size = + params.batch_size * params.num_heads * 
params.sequence_length * params.head_size; + auto kv_chunk_size = + params.batch_size * params.kv_num_heads * params.sequence_length * params.head_size; + if(idx < q_chunk_size) + { + run_rotary_embedding(q_input, + cos_cache.begin(), + sin_cache.begin(), + q_rotary, + seqlens_k.begin(), + params, + idx, + true); + } + else if(idx < q_chunk_size + kv_chunk_size) + { + run_rotary_embedding(k_input, + cos_cache.begin(), + sin_cache.begin(), + k_rotary, + seqlens_k.begin(), + params, + idx - q_chunk_size); + } + else if(idx < output.get_shape().elements()) + { + pack_v_into_rotary_qkv(params, v_input, v_rotary, idx - (q_chunk_size + kv_chunk_size)); + } + }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gqa_softmax.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gqa_softmax.hpp new file mode 100644 index 000000000..27e2154b6 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/gqa_softmax.hpp @@ -0,0 +1,138 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_GQA_SOFTMAX_HPP +#define MIGRAPHX_GUARD_KERNELS_GQA_SOFTMAX_HPP + +#include +#include +#include + +namespace migraphx { + +template +__device__ void softmax_inplace(T score, int n, int d) +{ + for(int j = 0; j < n; ++j) + { + auto x = score + j * d; + auto y = x; + + // e^x is represented as infinity if x is large enough, like 100.f. + // Infinity divided by Infinity is a NAN. Thus, softmax gets a NAN if + // one or more item are large enough. a math transform as below is + // leveraged to get a stable softmax: e^xi/(e^x1 + ...e^xn) = e^(xi - + // max) / (e^(x1 - max) + ... 
+ e^(xn - max)) + float max = -numeric_max(); + for(int i = 0; i < d; i++) + { + if(max < x[i]) + max = x[i]; + } + for(int i = 0; i < d; i++) + { + y[i] = expf(x[i] - max); + } + + float sum = 0.0; + for(int i = 0; i < d; i++) + { + sum += x[i]; + } + + for(int i = 0; i < d; i++) + { + y[i] = x[i] / static_cast(sum); + } + } +} + +template +__device__ void calculate_softmax(AttnProbs attention_probs, // output buffer with size BxNxSxT + SeqLensK seqlens_k, // past sequence lengths tensor + Params params, + index_int idx) +{ + const index_int batch_size = params.batch_size; + const index_int sequence_length = params.sequence_length; + const index_int num_heads = params.num_heads; + const index_int present_buffer_sequence_length = params.seqlen_present_kv_cache; + + const index_int loop_len = batch_size * num_heads; + const index_int i = idx / sequence_length; + const index_int inner_i = idx % sequence_length; + if(i < loop_len) + { + const index_int batch_index = i / num_heads; + const index_int total_seqlen = seqlens_k[batch_index] + 1; + const index_int output_offset = i * sequence_length * present_buffer_sequence_length; + auto output = attention_probs + output_offset; + + const int local_window_size = params.local_window_size; + auto output_softmax = output; + index_int seq = inner_i; + if(seq < sequence_length) + { + output_softmax += seq * present_buffer_sequence_length; + auto consume = total_seqlen + local_window_size; + seq += consume; + seq -= consume; + int seq_causal_length = sequence_length == 1 ? total_seqlen : seq + 1; + if(local_window_size > 0 and seq_causal_length > local_window_size + 1) + { + for(int total_seq_id = 0; total_seq_id < seq_causal_length - local_window_size - 1; + total_seq_id++) + { + output_softmax[total_seq_id] = 0.f; + } + softmax_inplace(output_softmax + seq_causal_length - local_window_size - 1, + 1, + local_window_size + 1); + } + else + { + softmax_inplace(output_softmax, 1, seq_causal_length); + } + for(int total_seq_id = seq_causal_length; total_seq_id < total_seqlen; total_seq_id++) + { + output_softmax[total_seq_id] = 0.f; + } + } + } +} + +template +__device__ void +gqa_softmax(Output output, Input, PresentKey, Probs, SeqLensK seqlens_k, Params params) +{ + const index_int elements = params.batch_size * params.num_heads * params.sequence_length; + auto ind = make_index(); + ind.global_stride(elements, [&](auto idx) { + calculate_softmax(output.begin(), seqlens_k.begin(), params, idx); + }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/group_query_attention.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/group_query_attention.hpp new file mode 100644 index 000000000..dbb60e7bd --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/group_query_attention.hpp @@ -0,0 +1,122 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_GROUP_QUERY_ATTENTION_HPP +#define MIGRAPHX_GUARD_KERNELS_GROUP_QUERY_ATTENTION_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { + +template +struct gqa_parameters +{ + T1 scale; + T2 batch_size; // Batch size used by input + T3 sequence_length; // Sequence length used by input + T4 hidden_size; // Hidden size used by input + T5 head_size; // Head size + T6 rotary_embedding_dim; // Rotary embedding dimension. + T7 num_heads; // num_heads = hidden_size / head_size + T8 max_sequence_length; // Sequence length used by cos/sin cache + T9 head_stride; // Head stride + T10 seq_stride; // Sequence stride + T11 batch_stride; // Batch stride + T12 position_ids_format; // Format of position ids - 0 is (1), 1 is (batch_size, + // sequence_length) + T13 seqlen_present_kv_cache; // Sequence length of present kv-cache (4096 when using + // shared buffer) + T14 do_rotary; // Whether to use rotary position embedding. Default value is 0. + T15 kv_num_heads; // Number of attention heads for k and v + T16 local_window_size; // left_window_size for local attention. Default value is -1 meaning + // unused. + T17 rotary_interleaved; // Rotate using interleaved pattern. Default value is 0 (False). + T18 past_present_share_buffer; // Whether to use same buffer for KV-cache inputs and outputs +}; + +template +__device__ gqa_parameters make_gqa_parameters(Ts... ts) +{ + return {ts...}; +} + +struct naive_gemm +{ + index_int max_m; + index_int max_n; + index_int max_k; + index_int lda; + index_int ldb; + index_int ldc; + bool b_transpose; + float alpha; + float beta; + + template + __device__ void compute(C cmat, const A amat, const B bmat, const index_int idx) + { + auto m = idx / max_n; + auto n = idx % max_n; + auto index = [&](auto x, auto y, auto z) { return y + (x * z); }; + + if(m < max_m) + { + if(n < max_n) + { + double s = 0.0; + for(int k = 0; k < max_k; ++k) + { + auto a_i = index(m, k, lda); + auto b_i = b_transpose ? 
index(n, k, ldb) : index(k, n, ldb); + s += static_cast(amat[a_i]) * static_cast(bmat[b_i]); + } + auto c_i = index(m, n, ldc); + cmat[c_i] = static_cast(alpha) * s + cmat[c_i] * static_cast(beta); + } + } + } +}; + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/hip.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/hip.hpp new file mode 100644 index 000000000..8ddc7ad0e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/hip.hpp @@ -0,0 +1,33 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_HIP_HPP +#define MIGRAPHX_GUARD_KERNELS_HIP_HPP + +#ifndef MIGRAPHX_USE_HIPRTC +#include +#include +#include +#endif + +#endif // MIGRAPHX_GUARD_KERNELS_HIP_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/index.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/index.hpp new file mode 100644 index 000000000..9c43f5d3b --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/index.hpp @@ -0,0 +1,309 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+#ifndef MIGRAPHX_GUARD_KERNELS_INDEX_HPP
+#define MIGRAPHX_GUARD_KERNELS_INDEX_HPP
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+extern "C" __device__ size_t __ockl_get_enqueued_local_size(uint); // NOLINT
+extern "C" __device__ size_t __ockl_get_local_size(uint); // NOLINT
+#pragma clang diagnostic pop
+#endif
+
+namespace migraphx {
+
+#if defined(MIGRAPHX_NGLOBAL) && defined(MIGRAPHX_NLOCAL)
+#define MIGRAPHX_NGROUP ((MIGRAPHX_NGLOBAL + MIGRAPHX_NLOCAL - 1) / MIGRAPHX_NLOCAL)
+#endif
+
+inline __device__ __attribute__((const)) index_int compute_global_size()
+{
+#ifdef MIGRAPHX_NGLOBAL
+    return MIGRAPHX_NGLOBAL;
+#else
+    // This actually works even when global is not divisible by local size.
+    // This doesn't actually do a multiplication. Instead it calls a device
+    // function to get the global size, which is why it works.
+    return blockDim.x * gridDim.x; // NOLINT
+#endif
+}
+
+#ifdef MIGRAPHX_NGROUP
+// If global is divisible by local then local can be a const
+#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
+#define MIGRAPHX_HAS_CONST_LOCAL 1
+#endif
+#endif
+
+inline __device__ __attribute__((const)) index_int compute_local_size()
+{
+#ifdef MIGRAPHX_HAS_CONST_LOCAL
+    return MIGRAPHX_NLOCAL;
+#else
+    // Returns the block size. For a non-uniform block it returns the size of that non-uniform block.
+    return __ockl_get_local_size(0); // NOLINT
+#endif
+}
+
+inline __device__ __attribute__((const)) index_int compute_max_local_size()
+{
+#ifdef MIGRAPHX_NLOCAL
+    return MIGRAPHX_NLOCAL;
+#else
+    // Returns the block size. When the workgroup has a non-uniform block, this returns the
+    // size of the uniform block.
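+    // Added commentary (not in the original patch): __ockl_get_enqueued_local_size reports the
+    // work-group size that was enqueued, which is always the uniform size, while
+    // __ockl_get_local_size above can report a smaller value for the trailing, partially filled
+    // work-group. That difference is why this function is the "max" local size.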
+ return __ockl_get_enqueued_local_size(0); // NOLINT +#endif +} + +struct index +{ + index_int global = 0; + index_int local = 0; + index_int group = 0; + +#ifdef MIGRAPHX_NGLOBAL + constexpr index_constant nglobal() const + { + static_assert(MIGRAPHX_NGLOBAL > 0, "Global size must be greater than 0"); + return {}; + } +#else + __device__ index_int nglobal() const + { + MIGRAPHX_ASSERT(compute_global_size() > 0); + return compute_global_size(); // NOLINT + } +#endif + +#ifdef MIGRAPHX_HAS_CONST_LOCAL + constexpr index_constant nlocal() const + { + static_assert(MIGRAPHX_NLOCAL > 0, "Local size must be greater than 0"); + return {}; + } +#else + __device__ index_int nlocal() const + { +#ifdef MIGRAPHX_NGROUP + static_assert((MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL != 0) and (MIGRAPHX_NGROUP > 1), + "Local size should be const"); +#endif + MIGRAPHX_ASSERT(compute_local_size() > 0); + return compute_local_size(); // NOLINT + } +#endif + +#ifdef MIGRAPHX_NLOCAL + constexpr index_constant max_nlocal() const { return {}; } +#else + __device__ index_int max_nlocal() const + { + MIGRAPHX_ASSERT(compute_max_local_size() > 0); + return compute_max_local_size(); + } +#endif + + constexpr auto ngroup() const { return nglobal() / max_nlocal(); } + + template + constexpr index_constant nlocal_subwave() const + { + return {}; + } + template + constexpr auto local_subwave() const + { +#ifdef MIGRAPHX_HAS_CONST_LOCAL + if constexpr(decltype(nlocal()){} == SubWaveSize) + return local; +#endif + return local % nlocal_subwave(); + } + template + constexpr auto nwave() const + { + return max_nlocal() / nlocal_subwave(); + } + + constexpr index_constant nlocal_wave() const { return {}; } + constexpr auto local_wave() const { return local % nlocal_wave(); } + constexpr auto nwave() const { return max_nlocal() / nlocal_wave(); } + constexpr auto wave() const { return local / nlocal_wave(); } + + template + static constexpr auto max_stride_iterations(N n, Stride stride) + { + return (n - _c<1>) / stride + _c<1>; + } + + template + constexpr auto max_global_stride_iterations(N n) const + { + return max_stride_iterations(n, nglobal()); + } + + template + constexpr auto max_local_stride_iterations(N n) const + { + return max_stride_iterations(n, nlocal()); + } + + template + constexpr auto max_local_wave_stride_iterations(N n) const + { + return max_stride_iterations(n, nlocal_wave()); + } + + template + constexpr auto max_local_subwave_stride_iterations(N n) const + { + return max_stride_iterations(n, nlocal_subwave()); + } + + template + static constexpr auto invoke_loop(F f, I i, D d) -> decltype(f(i, d)) + { + return f(i, d); + } + + template + static constexpr auto invoke_loop(F f, I i, D) -> decltype(f(i)) + { + return f(i); + } + + template + static constexpr void for_stride_loop_unroll(index_int start, N n, Stride stride, F f) + { + sequence(max_stride_iterations(n, stride), [&](auto... 
ks) { + fold([&](auto d, auto k) { + auto i = start + stride * k; + if(i < n) + invoke_loop(f, i, d); + return d + _c<1>; + })(_c<0>, ks...); + }); + } + + template + static constexpr void for_stride_loop(index_int start, N n, Stride stride, F f) + { + index_int k = 0; + for(index_int i = start; i < n; i += stride) + { + invoke_loop(f, i, k); + k++; + } + } + + template + static constexpr void for_stride(index_int start, N n, Stride stride, F f) + { + MIGRAPHX_ASSERT(start < stride); + if constexpr(not is_integral{} and not is_integral{}) + { + if constexpr(max_stride_iterations(n, stride) == 1) + { + if constexpr(stride > n) + { + if(start < n) + invoke_loop(f, start, _c<0>); + } + else + { + invoke_loop(f, start, _c<0>); + } + } + else if constexpr(Unroll) + { + MIGRAPHX_STATIC_ASSERT_FOR(max_stride_iterations(n, stride) < 256) + { + for_stride_loop_unroll(start, n, stride, f); + } + } + else + { + for_stride_loop(start, n, stride, f); + } + } + else + { + for_stride_loop(start, n, stride, f); + } + } + + template + __device__ void global_stride(N n, F f) const + { + for_stride(global, n, nglobal(), f); + } + + template + __device__ void local_stride(N n, F f) const + { + for_stride(local, n, nlocal(), f); + } + + template + __device__ void group_stride(N n, F f) const + { + for_stride(group, n, ngroup(), f); + } + + template + __device__ void local_subwave_stride(N n, F f) const + { + for_stride(local_subwave(), n, nlocal_subwave(), f); + } + + template + __device__ void local_wave_stride(N n, F f) const + { + for_stride(local_wave(), n, nlocal_wave(), f); + } +}; + +#ifdef MIGRAPHX_NLOCAL +#define MIGRAPHX_GLOBAL \ + __global__ __attribute__((amdgpu_flat_work_group_size(MIGRAPHX_NLOCAL, MIGRAPHX_NLOCAL))) +#else +#define MIGRAPHX_GLOBAL __global__ +#endif +inline __device__ __attribute__((const)) index make_index() +{ + return index{ + blockIdx.x * compute_max_local_size() + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_INDEX_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp new file mode 100644 index 000000000..63807ff78 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp @@ -0,0 +1,103 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_INTEGRAL_CONSTANT_HPP +#define MIGRAPHX_GUARD_KERNELS_INTEGRAL_CONSTANT_HPP + +#include + +namespace migraphx { + +template +struct integral_constant +{ + static constexpr T value = V; + using value_type = T; + using type = integral_constant; + constexpr operator value_type() const noexcept { return value; } + constexpr value_type operator()() const noexcept { return value; } + static constexpr type to() { return {}; } +}; + +// NOLINTNEXTLINE +#define MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(op) \ + template \ + constexpr inline integral_constant operator op( \ + integral_constant, integral_constant) noexcept \ + { \ + return {}; \ + } + +// NOLINTNEXTLINE +#define MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(op) \ + template \ + constexpr inline integral_constant operator op( \ + integral_constant) noexcept \ + { \ + return {}; \ + } + +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(+) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(-) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(*) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(/) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(%) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(>>) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(<<) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(>) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(<) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(<=) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(>=) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(==) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(!=) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(&) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(^) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(|) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(and) +MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(or) + +MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(not ) +MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(~) +MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(+) +MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(-) + +template +using bool_constant = integral_constant; + +using true_type = bool_constant; +using false_type = bool_constant; + +template +using index_constant = integral_constant; + +template +static constexpr auto _c = integral_constant{}; // NOLINT + +template +constexpr auto return_c(F f) +{ + return _c; +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_INTEGRAL_CONSTANT_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/iota_iterator.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/iota_iterator.hpp new file mode 100644 index 000000000..c04522778 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/iota_iterator.hpp @@ -0,0 +1,168 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_IOTA_ITERATOR_HPP +#define MIGRAPHX_GUARD_KERNELS_IOTA_ITERATOR_HPP + +#include +#include +#include + +namespace migraphx { + +template +struct basic_iota_iterator +{ + Iterator index; + F f; + + using difference_type = diff_int; + using reference = decltype(f(declval())); + using value_type = remove_reference_t; + using pointer = add_pointer_t; + + constexpr basic_iota_iterator& operator+=(diff_int n) + { + index += n; + return *this; + } + + constexpr basic_iota_iterator& operator-=(diff_int n) + { + index -= n; + return *this; + } + + constexpr basic_iota_iterator& operator++() + { + index++; + return *this; + } + + constexpr basic_iota_iterator& operator--() + { + index--; + return *this; + } + + constexpr basic_iota_iterator operator++(int) // NOLINT + { + basic_iota_iterator it = *this; + index++; + return it; + } + + constexpr basic_iota_iterator operator--(int) // NOLINT + { + basic_iota_iterator it = *this; + index--; + return it; + } + // TODO: operator-> + constexpr reference operator*() const { return f(index); } + + constexpr reference operator[](MIGRAPHX_CAPTURE_SOURCE_LOCATION(index_int) x) const + { + return f(capture_transform(x, [&](auto y) { return index + y; })); + } +}; + +template +constexpr basic_iota_iterator make_basic_iota_iterator(T x, F f) +{ + return basic_iota_iterator{x, f}; +} + +template +constexpr basic_iota_iterator operator+(basic_iota_iterator x, diff_int y) +{ + return x += y; +} + +template +constexpr basic_iota_iterator operator+(diff_int x, basic_iota_iterator y) +{ + return y + x; +} + +template +constexpr diff_int operator-(basic_iota_iterator x, basic_iota_iterator y) +{ + return x.index - y.index; +} + +template +constexpr basic_iota_iterator operator-(basic_iota_iterator x, diff_int y) +{ + return x -= y; +} + +template +constexpr bool operator==(basic_iota_iterator x, basic_iota_iterator y) +{ + return x.index == y.index; +} + +template +constexpr bool operator!=(basic_iota_iterator x, basic_iota_iterator y) +{ + return x.index != y.index; +} + +template +constexpr bool operator<(basic_iota_iterator x, basic_iota_iterator y) +{ + return x.index < y.index; +} + +template +constexpr bool operator>(basic_iota_iterator x, basic_iota_iterator y) +{ + return x.index > y.index; +} + +template +constexpr bool operator>=(basic_iota_iterator x, basic_iota_iterator y) +{ + return x.index >= y.index; +} + +template +constexpr bool operator<=(basic_iota_iterator x, basic_iota_iterator y) +{ + return x.index <= y.index; +} + +struct defaul_iota_iterator +{ + template + constexpr auto operator()(T x) const + { + return x; + } +}; + +using iota_iterator = basic_iota_iterator; + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_IOTA_ITERATOR_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp new file mode 100644 index 000000000..c64ab5531 --- /dev/null +++ 
b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp @@ -0,0 +1,112 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_LAYERNORM_HPP +#define MIGRAPHX_GUARD_KERNELS_LAYERNORM_HPP +#include +#include +#include +#include + +namespace migraphx { + +template +struct acc_type +{ + using type = float; +}; + +template <> +struct acc_type +{ + using type = double; +}; + +template +constexpr auto vec_reduce(const array& a, Op op) +{ + return a.apply([&](auto x) { return vec_reduce(x, op); }); +} + +template +__device__ void generic_binary_layernorm( + F compute, BinOp op, float eps, Output output, Input1 input1, Input2 input2, Inputs... inputs) +{ + using block = reduce::auto_block()>; + using reduce_output = reduce::with_axis; + + block::template run([&](auto, auto r) { + using value_type = typename Input1::type; + using vec_value_type = typename acc_type>::type; + + auto input = r.inner([&](auto x1, auto x2) { + return migraphx::convert(op(x1, x2)); + })(input1, input2); + + constexpr auto relements = r.template elements(); + constexpr auto relements_r = vec_value_type{1.0 / relements}; + auto relements_rsqrt = sqrt(relements_r); + + auto means = r.reduce(op::sum{}, make_array(0, 0), [&](auto x) { + auto x_out = x * relements_r; + // dividing x by sqrt(relements) before squaring allows computing + // higher values before overflow in low precision + auto x2_sqrt = x * relements_rsqrt; + return make_array(x_out, x2_sqrt * x2_sqrt); + })(input); + + auto mean_x = means[0]; + auto mean_x2 = means[1]; + auto variance = mean_x2 - (mean_x * mean_x); + vec_value_type eps_val = implicit_conversion(eps); + auto rsqrt_val = rsqrt(variance + eps_val); + + r.inner([&](auto& y, auto x, auto... xs) { + y = compute(migraphx::convert>((x - mean_x) * rsqrt_val), xs...); + })(output, input, inputs...); + }); +} + +template +__device__ void layernorm(F compute, float eps, Output output, Input input, Inputs... inputs) +{ + generic_binary_layernorm( + compute, [](auto x, auto) { return x; }, eps, output, input, input, inputs...); +} + +template +__device__ void +add_layernorm(F compute, float eps, Output output, Input1 input1, Input2 input2, Inputs... 
inputs) +{ + generic_binary_layernorm( + compute, [](auto x1, auto x2) { return x1 + x2; }, eps, output, input1, input2, inputs...); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_LAYERNORM_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/math.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/math.hpp new file mode 100644 index 000000000..5052d6611 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/math.hpp @@ -0,0 +1,300 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_MATH_HPP +#define MIGRAPHX_GUARD_KERNELS_MATH_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { + +namespace math { + +template +constexpr auto as_float(T x) +{ + if constexpr(is_integral{}) + return x; + else + return float(x); +} + +template +constexpr auto to_native(T x) +{ + return x; +} + +constexpr migraphx::half to_native(__half x) { return bit_cast(x); } + +template ())> +__device__ auto wrap(F f, T x, Ts... xs) +{ + if constexpr(is_integral{}) + { + return wrap(f, double(x), double(xs)...); + } + else if constexpr(is_callable{}) + { + return to_native(f(x, xs...)); + } + else + { + T result = f(as_float(x), as_float(xs)...); + return result; + } +} + +} // namespace math + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_MATH_LIFT_IMPL(type, ...) \ + [](type x, auto... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(x, xs...)) + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_MATH_LIFT(...) MIGRAPHX_DEVICE_MATH_LIFT_IMPL(__VA_ARGS__) + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_MATH_PARSE(x) x, + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_MATH_EACH(f) MIGRAPHX_DEVICE_MATH_LIFT(MIGRAPHX_DEVICE_MATH_PARSE f) + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_MATH_WRAP(name, ...) \ + namespace math { \ + inline static constexpr auto wrap_##name = \ + overload(MIGRAPHX_PP_TRANSFORM_ARGS(MIGRAPHX_DEVICE_MATH_EACH, __VA_ARGS__)); \ + } \ + template \ + auto __device__ name(Ts... xs) MIGRAPHX_RETURNS(math::wrap(math::wrap_##name, xs...)) + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_MATH(name, fname) \ + template ())> \ + auto __device__ name(Ts... xs) MIGRAPHX_RETURNS(fname(xs...)) + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_MATH_VEC(name) \ + template ())> \ + auto __device__ name(Ts... 
xs) \ + { \ + return vec_transform(xs...)([](auto... ys) { return name(ys...); }); \ + } + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_MATH_FOR(type, name, fname) \ + template ())> \ + auto __device__ name(type x, Ts... xs) -> type \ + { \ + return fname(x, xs...); \ + } + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_MATH_BINARY_FOR(type, name, fname) \ + inline auto __device__ name(type x, type y) -> type { return fname(x, y); } + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_MATH_HALF2(name, fname) \ + template \ + auto __device__ name(migraphx::vec x, Ts... xs) \ + MIGRAPHX_RETURNS(migraphx::vec{fname(x, xs...)}); \ + template 2))> \ + auto __device__ name(migraphx::vec x, Ts... xs) \ + { \ + return vec_packed_transform<2>(x, xs...)( \ + [](auto... ys) -> migraphx::vec { return fname(ys...); }); \ + } + +// Template with two overloads for math functions, one for half2 type and one for more generic +// vectorization where N is 4 or another even number. +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_MATH_VEC2(type, name, fname) \ + template \ + auto __device__ name(migraphx::vec x, Ts... xs) \ + MIGRAPHX_RETURNS(migraphx::vec{fname(x, xs...)}); \ + template 2))> \ + auto __device__ name(migraphx::vec x, Ts... xs) \ + { \ + return vec_packed_transform<2>(x, xs...)( \ + [](auto... ys) -> migraphx::vec { return fname(ys...); }); \ + } + +MIGRAPHX_DEVICE_MATH_WRAP(acos, (double)::acos, (float)::acosf); +MIGRAPHX_DEVICE_MATH_WRAP(acosh, (double)::acosh, (float)::acoshf); +MIGRAPHX_DEVICE_MATH_WRAP(asin, (double)::asin, (float)::asinf); +MIGRAPHX_DEVICE_MATH_WRAP(asinh, (double)::asinh, (float)::asinh); +MIGRAPHX_DEVICE_MATH_WRAP(atan, (double)::atan, (float)::atan); +MIGRAPHX_DEVICE_MATH_WRAP(atanh, (double)::atanh, (float)::atanh); +MIGRAPHX_DEVICE_MATH_WRAP(ceil, (double)::ceil, (float)::ceilf, (half)::hceil); +MIGRAPHX_DEVICE_MATH_WRAP(cos, (double)::cos, (float)::cosf, (half)::hcos); +MIGRAPHX_DEVICE_MATH_WRAP(cosh, (double)::cosh, (float)::coshf); +MIGRAPHX_DEVICE_MATH_WRAP(erf, (double)::erf, (float)::erff); +MIGRAPHX_DEVICE_MATH_WRAP(exp, (double)::exp, (float)::expf, (half)::hexp); +MIGRAPHX_DEVICE_MATH_WRAP(floor, (double)::floor, (float)::floorf, (half)::hfloor); +MIGRAPHX_DEVICE_MATH_WRAP(isnan, (double)::isnan, (float)::isnan, (half)::__hisnan); +MIGRAPHX_DEVICE_MATH_WRAP(isinf, (double)::isinf, (float)::isinf, (half)::__hisinf); +MIGRAPHX_DEVICE_MATH_WRAP(log, (double)::log, (float)::logf, (half)::hlog); +MIGRAPHX_DEVICE_MATH_WRAP(log2, (double)::log2, (float)::log2f, (half)::hlog2); +MIGRAPHX_DEVICE_MATH_WRAP(nearbyint, (double)::nearbyint, (float)::nearbyintf); +MIGRAPHX_DEVICE_MATH_WRAP(pow, (double)::pow, (float)::powf); +MIGRAPHX_DEVICE_MATH_WRAP(remainder, (double)::remainder, (float)::remainderf); +MIGRAPHX_DEVICE_MATH_WRAP(round, (double)::round, (float)::roundf); +MIGRAPHX_DEVICE_MATH_WRAP(rsqrt, (double)::rsqrt, (float)::rsqrtf, (half)::hrsqrt); +MIGRAPHX_DEVICE_MATH_WRAP(sin, (double)::sin, (float)::sinf, (half)::hsin); +MIGRAPHX_DEVICE_MATH_WRAP(sinh, (double)::sinh, (float)::sinhf); +MIGRAPHX_DEVICE_MATH_WRAP(sqrt, (double)::sqrt, (float)::sqrtf, (half)::hsqrt); +MIGRAPHX_DEVICE_MATH_WRAP(tan, (double)::tan, (float)::tanf); +MIGRAPHX_DEVICE_MATH_WRAP(tanh, (double)::tanh, (float)::tanhf); +MIGRAPHX_DEVICE_MATH_WRAP(fmod, (double)::fmod, (float)::fmodf); + +template +constexpr auto where(bool cond, const T& a, const U& b) +{ + return cond ? 
a : b; +} + +MIGRAPHX_DEVICE_MATH_FOR(float, abs, ::abs) +MIGRAPHX_DEVICE_MATH_FOR(double, abs, ::abs) +MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, abs, ::__habs) +MIGRAPHX_DEVICE_MATH_FOR(migraphx::bf16, abs, ::fabsf) +MIGRAPHX_DEVICE_MATH_BINARY_FOR(float, max, ::fmaxf) +MIGRAPHX_DEVICE_MATH_BINARY_FOR(float, min, ::fminf) +MIGRAPHX_DEVICE_MATH_BINARY_FOR(double, max, ::max) +MIGRAPHX_DEVICE_MATH_BINARY_FOR(double, min, ::min) +MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, max, ::__hmax) +MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, min, ::__hmin) + +template () and is_integral{})> +constexpr auto abs(const T& a) +{ + return where(a < 0, -a, a); +} + +template ())> +constexpr auto max(const T& a, const T& b) +{ + return where(a < b, b, a); +} + +template ())> +constexpr auto min(const T& a, const T& b) +{ + return where(a < b, a, b); +} + +template {} and not is_any_vec())> +constexpr auto max(const T& a, const U& b) +{ + return max>(a, b); +} + +template {} and not is_any_vec())> +constexpr auto min(const T& a, const U& b) +{ + return min>(a, b); +} + +template ())> +constexpr T mod(const T& a, const T& b) +{ + if constexpr(is_integral{}) + // onnx mod operator requires numpy style modulus + return ((a % b) + b) % b; + return static_cast(fmod(remainder(a, b) + b, b)); +} + +template {} and not is_any_vec())> +constexpr auto mod(const T& a, const U& b) +{ + return mod>(a, b); +} + +MIGRAPHX_DEVICE_MATH_VEC(abs) +MIGRAPHX_DEVICE_MATH_VEC(acos) +MIGRAPHX_DEVICE_MATH_VEC(acosh) +MIGRAPHX_DEVICE_MATH_VEC(asin) +MIGRAPHX_DEVICE_MATH_VEC(asinh) +MIGRAPHX_DEVICE_MATH_VEC(atan) +MIGRAPHX_DEVICE_MATH_VEC(atanh) +MIGRAPHX_DEVICE_MATH_VEC(ceil) +MIGRAPHX_DEVICE_MATH_VEC(cos) +MIGRAPHX_DEVICE_MATH_VEC(cosh) +MIGRAPHX_DEVICE_MATH_VEC(erf) +MIGRAPHX_DEVICE_MATH_VEC(exp) +MIGRAPHX_DEVICE_MATH_VEC(floor) +MIGRAPHX_DEVICE_MATH_VEC(fmod) +MIGRAPHX_DEVICE_MATH_VEC(isinf) +MIGRAPHX_DEVICE_MATH_VEC(isnan) +MIGRAPHX_DEVICE_MATH_VEC(log) +MIGRAPHX_DEVICE_MATH_VEC(log2) +MIGRAPHX_DEVICE_MATH_VEC(max) +MIGRAPHX_DEVICE_MATH_VEC(min) +MIGRAPHX_DEVICE_MATH_VEC(mod) +MIGRAPHX_DEVICE_MATH_VEC(nearbyint) +MIGRAPHX_DEVICE_MATH_VEC(pow) +MIGRAPHX_DEVICE_MATH_VEC(remainder) +MIGRAPHX_DEVICE_MATH_VEC(round) +MIGRAPHX_DEVICE_MATH_VEC(rsqrt) +MIGRAPHX_DEVICE_MATH_VEC(sin) +MIGRAPHX_DEVICE_MATH_VEC(sinh) +MIGRAPHX_DEVICE_MATH_VEC(sqrt) +MIGRAPHX_DEVICE_MATH_VEC(tan) +MIGRAPHX_DEVICE_MATH_VEC(tanh) +MIGRAPHX_DEVICE_MATH_VEC(where) + +// Map math functions to hip half2 functions +// The half2 type is defined in include/hip/amd_detail/hip_fp16_gcc.h and is 2 16-bit floats +// packed into a 32-bit number. See include/hip/amd_detail/hip_fp16_math_fwd.h for the HIP names +// Most but not all of these math ops have operators of the same names. 
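// A minimal host-side analogue of the MIGRAPHX_DEVICE_MATH_VEC overloads above:
// the generated functions apply the scalar math function to every lane of a
// fixed-width vector. The 4-wide std::array is only a stand-in for
// migraphx::vec<float, 4>; this is an illustration, not the device implementation.
#include <array>
#include <cmath>
#include <cstddef>

template <class F>
std::array<float, 4> vec_transform4(const std::array<float, 4>& x, F f)
{
    std::array<float, 4> out{};
    for(std::size_t i = 0; i < 4; ++i)
        out[i] = f(x[i]); // one scalar call per lane, as vec_transform does
    return out;
}

// Usage: vec_transform4({1.f, 4.f, 9.f, 16.f}, [](float v) { return std::sqrt(v); })
// yields {1, 2, 3, 4}; the half2 specializations below do the same two lanes at a time.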
+MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, abs, ::__habs2) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, ceil, ::h2ceil) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, cos, ::h2cos) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, exp, ::h2exp) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, exp10, ::h2exp10) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, exp2, ::h2exp2) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, floor, ::h2floor) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, isinf, ::__hisinf2) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, isnan, ::__hisnan2) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, log, ::h2log) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, log10, ::h2log10) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, log2, ::h2log2) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, rsqrt, ::h2rsqrt) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, sin, ::h2sin) +MIGRAPHX_DEVICE_MATH_VEC2(migraphx::half, sqrt, ::h2sqrt) + +template +constexpr auto convert(U v) +{ + return vec_transform(v)([](auto x) -> T { return static_cast(x); }); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_MATH_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/operators.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/operators.hpp new file mode 100644 index 000000000..35f9c920e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/operators.hpp @@ -0,0 +1,43 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +#ifndef MIGRAPHX_GUARD_KERNELS_OPERATORS_HPP +#define MIGRAPHX_GUARD_KERNELS_OPERATORS_HPP + +#include +#include + +namespace migraphx { + +template +struct equality_comparable +{ + template + friend constexpr auto operator!=(const T& x, const U& y) MIGRAPHX_RETURNS(not(x == y)); + template {} and is_same{})> + friend constexpr auto operator!=(const U& x, const V& y) MIGRAPHX_RETURNS(not(x == y)); +}; + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_OPERATORS_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ops.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ops.hpp new file mode 100644 index 000000000..be1ece0f9 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ops.hpp @@ -0,0 +1,164 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_OPS_HPP +#define MIGRAPHX_GUARD_KERNELS_OPS_HPP + +#include + +namespace migraphx { +namespace op { + +struct sum +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const + { + return x + y; + } +}; + +struct product +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const + { + return x * y; + } +}; + +struct id +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const + { + return x; + } +}; + +template +struct convert_to +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(U x) const + { + return convert(x); + } +}; + +template +struct mean +{ + template + MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x) const + { + using type = vec_type; + if constexpr(is_floating_point{}) + { + constexpr type d = 1.0 / N; + return x * d; + } + else + { + return x / static_cast(N); + } + } +}; + +struct max +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const + { + return migraphx::max(x, y); + } +}; + +struct min +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const + { + return migraphx::min(x, y); + } +}; + +struct logical_and +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const + { + if(static_cast(x) and static_cast(y)) + return static_cast(1); + return static_cast(0); + } +}; + +struct logical_or +{ + template + MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const + { + if(static_cast(x) or static_cast(y)) + return static_cast(1); + return static_cast(0); + } +}; +} // namespace op + +// NOLINTNEXTLINE +#define MIGRAPHX_OPS_DEFINE_COMMON_TYPE(T) \ + template \ + struct common_type \ + { \ + using type = U; \ + }; \ + template \ + struct common_type \ + { \ + using type = U; \ + }; + +struct lowest +{ + template + constexpr operator T() const + { + return numeric_lowest>(); + } +}; +MIGRAPHX_OPS_DEFINE_COMMON_TYPE(lowest) + +struct highest +{ + template + constexpr operator T() const + { + return numeric_max>(); + } +}; + +MIGRAPHX_OPS_DEFINE_COMMON_TYPE(highest) + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_OPS_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/pad.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/pad.hpp new file mode 100644 index 000000000..38d8be2de --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/pad.hpp @@ -0,0 +1,64 @@ +/* + * The MIT License (MIT) + * 
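// Quick illustration of the lowest/highest helpers defined in ops.hpp above: they
// convert to the numeric limit of whatever arithmetic type they are assigned to,
// which is how reductions seed their accumulators. Host-side sketch; names and the
// value type are illustrative only.
//
//   float acc = migraphx::lowest{};        // == numeric_lowest<float>()
//   for(float v : values)
//       acc = migraphx::op::max{}(acc, v); // max-reduction starting from lowest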
+ * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_PAD_HPP +#define MIGRAPHX_GUARD_KERNELS_PAD_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { + +template +__device__ void pad(const index& idx, + const Offsets& offsets, + const Input& input, + Output& output, + const PadVal& pad_val) +{ + auto output_shape = output.get_shape(); + idx.global_stride(output_shape.elements(), [&](auto i) { + // 1. get current multi-index for output + // 2. get the size of the input to determine input boundaries + // 3. compute the corresponding multi-index for input by accounting for offsets + // 4. if current multi-index is within offsets or input's new multi-index is out of bounds, + // use pad value instead of input's value + auto multi = output_shape.multi(i); + auto input_bounds = input.get_shape().lens; + auto input_idx = multi - offsets; + auto range_multi = range(multi.size()); + + if(any_of(range_multi.begin(), range_multi.end(), [&](auto j) { + return multi[j] < offsets[j] or input_idx[j] >= input_bounds[j]; + })) + output[multi] = implicit_conversion(pad_val); + else + output[multi] = implicit_conversion(input[input_idx]); + }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/permutation.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/permutation.hpp new file mode 100644 index 000000000..970484d6f --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/permutation.hpp @@ -0,0 +1,108 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
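// Host-side 1-D sketch of the index logic used by pad() above: output position i
// maps back to input position (i - offset), and anything outside [0, input size)
// keeps the pad value. The function name pad_1d is illustrative only.
#include <cstddef>
#include <vector>

std::vector<float> pad_1d(const std::vector<float>& input, std::size_t offset,
                          std::size_t out_size, float pad_val)
{
    std::vector<float> output(out_size, pad_val);
    for(std::size_t i = 0; i < out_size; ++i)
    {
        if(i < offset or i - offset >= input.size())
            continue; // inside the left padding or past the input: keep pad_val
        output[i] = input[i - offset];
    }
    return output;
}
// pad_1d({1, 2, 3}, 2, 7, 0) == {0, 0, 1, 2, 3, 0, 0}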
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +#ifndef MIGRAPHX_GUARD_KERNELS_PERMUTATION_HPP +#define MIGRAPHX_GUARD_KERNELS_PERMUTATION_HPP + +#include +#include +#include + +namespace migraphx { + +template +constexpr auto reorder_dims(const Array1& dims, const Array2& permutation) +{ + return generate_array( + dims.size(), [&](auto i) { return dims[permutation[i]]; }); +} + +template +constexpr auto reorder_dims(integral_const_array, integral_const_array) +{ + return return_array_c([] { + constexpr integral_const_array dims{}; + constexpr integral_const_array permutation{}; + return reorder_dims(dims.base(), permutation.base()); + }); +} + +template +constexpr auto invert_permutation(const Array& permutation) +{ + return reorder_dims(transform_i(permutation, [](auto, auto i) { return i; }), permutation); +} + +template +struct find_permutation_impl +{ + static constexpr auto compute() + { + return return_array_c([] { + constexpr Shape s{}; + typename Shape::index_array perm; + iota(perm.begin(), perm.end(), 0); + if constexpr(s.transposed() or s.broadcasted()) + { + stable_sort( + perm.begin(), + perm.end(), + by([&](auto x) { return make_tuple(s.strides[x], s.lens[x]); }, greater{})); + } + return perm; + }); + } + using type = decltype(compute()); +}; + +template +constexpr auto find_permutation(Shape) +{ + return typename find_permutation_impl::type{}; +} + +template +constexpr auto find_permutation(Shape1, Shape2) +{ + return return_array_c([] { + constexpr Shape1 s1{}; + constexpr Shape2 s2{}; + auto perm1 = find_permutation(s1).base(); + auto perm2 = find_permutation(s2).base(); + if(perm1 == perm2) + return perm1; + if(s1.standard()) + return perm1; + if(s2.standard()) + return perm2; + if(s1.packed()) + return perm1; + if(s2.packed()) + return perm2; + return perm1; + }); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_PERMUTATION_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp new file mode 100644 index 000000000..d97355a1d --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp @@ -0,0 +1,62 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
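// Host-side sketch of the stride-descending sort performed by find_permutation
// above: axes are ordered by (stride, length) from largest to smallest so that
// transposed or broadcast layouts are traversed in memory order. For lens {2, 3, 4}
// with strides {12, 1, 4} the result is {0, 2, 1}. Names are illustrative only.
#include <algorithm>
#include <array>
#include <cstddef>
#include <numeric>
#include <utility>

std::array<std::size_t, 3> find_permutation3(std::array<std::size_t, 3> strides,
                                             std::array<std::size_t, 3> lens)
{
    std::array<std::size_t, 3> perm{};
    std::iota(perm.begin(), perm.end(), 0);
    std::stable_sort(perm.begin(), perm.end(), [&](auto a, auto b) {
        // larger stride first; break ties with the larger length
        return std::make_pair(strides[a], lens[a]) > std::make_pair(strides[b], lens[b]);
    });
    return perm;
}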
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_POINTWISE_HPP +#define MIGRAPHX_GUARD_KERNELS_POINTWISE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { + +template +__device__ void pointwise_tensor(Stride stride, F f, Output out, T x, Ts... xs) +{ + stride(x.get_shape().elements(), [&](auto i) { + auto r = f(x[i], xs[i]...); + out([&](auto... outs) { + r([&](auto... rs) { + static_assert(sizeof...(outs) == sizeof...(rs)); + swallow{(outs[i] = implicit_conversion(rs))...}; + }); + }); + }); +} + +template +__device__ auto pointwise(index idx, Transforms... transforms) +{ + return [=](auto f, auto*... ps) { + auto t = transform_args(make_tensors(), transforms..., rotate_and_pack_last()); + t(ps...)([&](auto... xs) { pointwise_tensor(tile_stride(idx), f, xs...); }); + }; +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_POINTWISE_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp new file mode 100644 index 000000000..76bb7c3cb --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp @@ -0,0 +1,233 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
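// Conceptual host-side analogue of the pointwise kernel above: every output
// element is f applied to the corresponding element of each input. The real
// device version threads this through tensor views and tile_stride; this loop
// only mirrors the data flow, and the names are illustrative.
#include <cstddef>

void pointwise_add(const float* a, const float* b, float* out, std::size_t n)
{
    for(std::size_t i = 0; i < n; ++i)
        out[i] = a[i] + b[i]; // f(x, y) == x + y in this sketch
}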
+ * + */ +#ifndef MIGRAPHX_GUARD_KERNELS_POOLING_HPP +#define MIGRAPHX_GUARD_KERNELS_POOLING_HPP + +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { + +template +struct pool_op +{ + template + MIGRAPHX_DEVICE_CONSTEXPR T apply(T x) const + { + return x; + } + + MIGRAPHX_DEVICE_CONSTEXPR auto pad() const + { + const auto& self = static_cast(*this); + return self.init(); + } + + template + MIGRAPHX_DEVICE_CONSTEXPR T final(T x, U) const + { + return x; + } +}; + +struct max_pool : pool_op +{ + MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return lowest{}; } + + MIGRAPHX_DEVICE_CONSTEXPR auto reduce() const { return op::max{}; } +}; + +struct average_pool : pool_op +{ + MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return make_tuple(0.0, 0); } + + template + MIGRAPHX_DEVICE_CONSTEXPR tuple apply(T x) const + { + return {x, 1}; + } + + MIGRAPHX_DEVICE_CONSTEXPR auto reduce() const { return op::sum{}; } + + template + MIGRAPHX_DEVICE_CONSTEXPR T final(tuple t, U) const + { + T x = t[_c<0>]; + index_int y = t[_c<1>]; + return (y == 0) ? T{0.0} : T{x / y}; + } +}; + +struct average_include_pad_pool : pool_op +{ + MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return 0.0; } + + MIGRAPHX_DEVICE_CONSTEXPR auto reduce() const { return op::sum{}; } + + template + MIGRAPHX_DEVICE_CONSTEXPR T final(T x, U y) const + { + if constexpr(y == 0) + return T{0.0}; + constexpr auto scale = T{1.0} / y; + return T{x * scale}; + } +}; + +struct lpnorm_pool_base +{ +}; + +template +struct lpnorm_pool : lpnorm_pool_base, pool_op> +{ + MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return 0.0; } + + template + MIGRAPHX_DEVICE_CONSTEXPR T apply(T x) const + { + if constexpr(P == 0) + return 1; + else if constexpr(P == 1) + return migraphx::abs(x); + else if constexpr(P == 2) + return x * x; + else + return migraphx::pow(migraphx::abs(x), T(P)); + } + + MIGRAPHX_DEVICE_CONSTEXPR auto pad() const { return apply(init()); } + + MIGRAPHX_DEVICE_CONSTEXPR auto reduce() const { return op::sum{}; } + + template + MIGRAPHX_DEVICE_CONSTEXPR T final(T x, U) const + { + if constexpr(P == 0) + return 1; + else if constexpr(P == 1) + return x; + else if constexpr(P == 2) + return migraphx::sqrt(x); + else + return migraphx::pow(x, 1. 
/ P); + } +}; + +template +struct window +{ + Window win = {}; + Stride stride = {}; + Padding padding = {}; + + using rank = decltype(Window{}.size()); + + constexpr auto size() const + { + return return_c([] { return Window{}.product(); }); + } + + constexpr auto has_padding() const + { + return return_c([] { return Padding{} == 0; }); + } + + template + constexpr auto apply(OutputIndex i, F f) const + { + auto win_start = generate_array(rank{}, [&](auto j) { + diff_int dim = i[j]; + MIGRAPHX_ASSERT(win[j] >= 1); + diff_int s = stride[j]; + diff_int p = padding[j]; + return (dim * s) - p; + }); + return [=](auto j) { return f(win_start + win.multi(j)); }; + } + + template + constexpr void visit(Index i, F f) const + { + repeat(size(), apply(i, f)); + } +}; + +template +constexpr window make_window(Window w, Stride s, Padding p) +{ + return {w, s, p}; +} + +template +__device__ void pooling_reduce(Output output, F f) +{ + if constexpr(GroupSize < 2) + { + Algo::template run( + [&](auto out_idx, auto r) { r.outer([&] { output[out_idx] = f(out_idx, r); }); }); + } + else + { + auto goutput = as_vec(output, output.get_shape().lens.size() - _c<1>); + Algo::template run([&](auto out_idx, auto r) { + auto i = out_idx; + i.back() *= GroupSize; + auto result = vec_generate([&](auto) { + i.back()++; + return f(i, r); + }); + r.outer([&] { goutput[out_idx] = result; }); + }); + } +} + +template +__device__ void pooling(Op op, Window w, Output output, Input input) +{ + pooling_reduce(output, [&](auto out_idx, auto r) { + auto x = r.reduce(op.reduce(), op.init(), w.apply(out_idx, [&](auto j) { + using itype = decltype(op.apply(input[j])); + + if(j < input.get_shape().lens) + { + return op.apply(input[j]); + } + else + { + return itype(op.pad()); + } + }))(reduce::make_indices(w.size())); + return op.final(x, w.size()); + }); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_POOLING_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/pp.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/pp.hpp new file mode 100644 index 000000000..89b38ac24 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/pp.hpp @@ -0,0 +1,129 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
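// Host-side sketch of the window-corner arithmetic used by window::apply above:
// start = out_idx * stride - padding, which can be negative, hence the signed type.
// For kernel 3, stride 2, pad 1: out 0 -> start -1, out 1 -> start 1, out 2 -> start 3.
// Positions that fall outside the input contribute the pool's pad value (e.g. lowest
// for max pooling), matching the op.pad() branch in pooling() above.
#include <cstdint>

std::int64_t window_start(std::int64_t out_idx, std::int64_t stride, std::int64_t padding)
{
    return out_idx * stride - padding;
}

// A position p inside the window reads input[start + p] only when
// 0 <= start + p < input_len; otherwise the pad value is used.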
+ */ +#ifndef MIGRAPHX_GUARD_KERNELS_PP_HPP +#define MIGRAPHX_GUARD_KERNELS_PP_HPP + +// NOLINTBEGIN(*-macro-to-enum) + +#define MIGRAPHX_PP_PRIMITIVE_CAT(x, y) x##y +#define MIGRAPHX_PP_CAT(x, y) MIGRAPHX_PP_PRIMITIVE_CAT(x, y) + +#define MIGRAPHX_PP_EAT(...) +#define MIGRAPHX_PP_EXPAND(...) __VA_ARGS__ +#define MIGRAPHX_PP_COMMA(...) , + +#define MIGRAPHX_PP_IIF(c) MIGRAPHX_PP_PRIMITIVE_CAT(MIGRAPHX_PP_IIF_, c) +#define MIGRAPHX_PP_IIF_0(t, ...) __VA_ARGS__ +#define MIGRAPHX_PP_IIF_1(t, ...) t + +#define MIGRAPHX_PP_COMPL(b) MIGRAPHX_PP_PRIMITIVE_CAT(MIGRAPHX_PP_COMPL_, b) +#define MIGRAPHX_PP_COMPL_0 1 +#define MIGRAPHX_PP_COMPL_1 0 + +#define MIGRAPHX_PP_BITAND(x) MIGRAPHX_PP_PRIMITIVE_CAT(MIGRAPHX_PP_BITAND_, x) +#define MIGRAPHX_PP_BITAND_0(y) 0 +#define MIGRAPHX_PP_BITAND_1(y) y + +#define MIGRAPHX_PP_CHECK(...) MIGRAPHX_PP_CHECK_N(__VA_ARGS__, 0, ) +#define MIGRAPHX_PP_CHECK_N(x, n, ...) n +#define MIGRAPHX_PP_PROBE(x) x, 1, + +#define MIGRAPHX_PP_IS_PAREN(x) MIGRAPHX_PP_CHECK(MIGRAPHX_PP_IS_PAREN_PROBE x) +#define MIGRAPHX_PP_IS_PAREN_PROBE(...) MIGRAPHX_PP_PROBE(~) + +#define MIGRAPHX_PP_PRIMITIVE_IS_EMPTY(x) \ + MIGRAPHX_PP_CHECK(MIGRAPHX_PP_PRIMITIVE_IS_EMPTY_PROBE x()) +#define MIGRAPHX_PP_PRIMITIVE_IS_EMPTY_PROBE(...) MIGRAPHX_PP_PROBE(~) + +#define MIGRAPHX_PP_IS_EMPTY_ARG(x) \ + MIGRAPHX_PP_BITAND(MIGRAPHX_PP_COMPL(MIGRAPHX_PP_IS_PAREN(x))) \ + (MIGRAPHX_PP_PRIMITIVE_IS_EMPTY(x)) + +#define MIGRAPHX_PP_REPEAT0(m, ...) m(0, __VA_ARGS__) +#define MIGRAPHX_PP_REPEAT1(m, ...) MIGRAPHX_PP_REPEAT0(m, __VA_ARGS__) m(1, __VA_ARGS__) +#define MIGRAPHX_PP_REPEAT2(m, ...) MIGRAPHX_PP_REPEAT1(m, __VA_ARGS__) m(2, __VA_ARGS__) +#define MIGRAPHX_PP_REPEAT3(m, ...) MIGRAPHX_PP_REPEAT2(m, __VA_ARGS__) m(3, __VA_ARGS__) +#define MIGRAPHX_PP_REPEAT4(m, ...) MIGRAPHX_PP_REPEAT3(m, __VA_ARGS__) m(4, __VA_ARGS__) +#define MIGRAPHX_PP_REPEAT5(m, ...) MIGRAPHX_PP_REPEAT4(m, __VA_ARGS__) m(5, __VA_ARGS__) +#define MIGRAPHX_PP_REPEAT6(m, ...) MIGRAPHX_PP_REPEAT5(m, __VA_ARGS__) m(6, __VA_ARGS__) +#define MIGRAPHX_PP_REPEAT7(m, ...) MIGRAPHX_PP_REPEAT6(m, __VA_ARGS__) m(7, __VA_ARGS__) +#define MIGRAPHX_PP_REPEAT8(m, ...) MIGRAPHX_PP_REPEAT7(m, __VA_ARGS__) m(8, __VA_ARGS__) +#define MIGRAPHX_PP_REPEAT9(m, ...) MIGRAPHX_PP_REPEAT8(m, __VA_ARGS__) m(9, __VA_ARGS__) +#define MIGRAPHX_PP_REPEAT10(m, ...) MIGRAPHX_PP_REPEAT9(m, __VA_ARGS__) m(10, __VA_ARGS__) + +#define MIGRAPHX_PP_REPEAT(n, m, ...) \ + MIGRAPHX_PP_PRIMITIVE_CAT(MIGRAPHX_PP_REPEAT, n)(m, __VA_ARGS__) + +#define MIGRAPHX_PP_RES_ARGS() , , , , , , , , , , , , , , , + +#define MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARGS(...) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARGS_IMPL(__VA_ARGS__) + +#define MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARGS_IMPL( \ + m, delim, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, ...) 
\ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x0) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x1) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x1) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x2) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x2) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x3) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x3) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x4) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x4) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x5) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x5) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x6) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x6) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x7) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x7) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x8) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x8) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x9) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x9) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x10) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x10) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x11) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x11) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x12) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x12) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x13) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x13) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x14) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x14) \ + MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(delim, x15) MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x15) + +#define MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARG(m, x) \ + MIGRAPHX_PP_IIF(MIGRAPHX_PP_IS_EMPTY_ARG(x))(MIGRAPHX_PP_EAT, m)(x) + +#define MIGRAPHX_PP_EACH_ARGS(m, ...) \ + MIGRAPHX_PP_EXPAND(MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARGS( \ + m, MIGRAPHX_PP_EAT, __VA_ARGS__, MIGRAPHX_PP_RES_ARGS())) + +#define MIGRAPHX_PP_TRANSFORM_ARGS(m, ...) \ + MIGRAPHX_PP_EXPAND(MIGRAPHX_PP_PRIMITIVE_TRANSFORM_ARGS( \ + m, MIGRAPHX_PP_COMMA, __VA_ARGS__, MIGRAPHX_PP_RES_ARGS())) + +// NOLINTEND(*-macro-to-enum) + +#endif // MIGRAPHX_GUARD_KERNELS_PP_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/preload.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/preload.hpp new file mode 100644 index 000000000..3978d0af3 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/preload.hpp @@ -0,0 +1,198 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
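// Intended expansions of the variadic helpers defined above, assuming a
// user-supplied macro F(x) and up to 16 arguments:
//
//   MIGRAPHX_PP_TRANSFORM_ARGS(F, a, b, c)  ->  F(a), F(b), F(c)
//   MIGRAPHX_PP_EACH_ARGS(F, a, b, c)       ->  F(a) F(b) F(c)
//
// Unused slots expand to nothing because MIGRAPHX_PP_IS_EMPTY_ARG routes them to
// MIGRAPHX_PP_EAT. This is how math.hpp builds its overload sets: the arguments to
// MIGRAPHX_DEVICE_MATH_WRAP are transformed one by one into lifted lambdas with
// commas inserted between them.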
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP +#define MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP + +#include +#include +#include +#include + +namespace migraphx { + +template +struct remove_vec_impl +{ + using type = T; +}; + +template +struct remove_vec_impl> +{ + using type = T; +}; + +template +using remove_vec = typename remove_vec_impl::type; + +template +constexpr auto traverse_preload(Shapes... ss) +{ + return [=](auto f, auto... g) { + index_int offset = 0; + auto each = [&](auto x) { + using type = remove_vec; + constexpr auto s = decltype(x.get_shape()){}; + constexpr auto size = s.element_space(); + if constexpr(not s.broadcasted() or (s.elements() - size) < 64 or + not is_same{}) + return f(x, offset, false_type{}); + else + { + auto pre_offset = offset; + offset += size; + offset += offset % 4; + return f(x, pre_offset, true_type{}); + } + }; + return by(each, g...)(ss...); + }; +} + +template +constexpr index_int compute_preload_size_c(Shapes...) +{ + index_int size = 0; + traverse_preload(Shapes{}...)( + [&](auto s, auto offset, auto) { size = offset + s.element_space(); }); + return size; +} + +template +constexpr auto compute_preload_size(Shapes...) +{ + return _c(Shapes{}...)>; +} + +template +__device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs) +{ + auto invoke = [&](auto... ys) { + __syncthreads(); + f(ys...); + }; + traverse_preload(xs...)( + [&](auto x, auto offset, auto copy) { + if constexpr(copy) + { + if constexpr(decltype(tensor_vec_size(x)){} == 0) + { + auto v = auto_vectorize(x); + auto b = as_vec(tensor_vec_size(v), buffer + offset); + idx.local_stride(v.get_shape().element_space(), + [&](auto i) { b[i] = v.data()[i]; }); + return x.with(buffer + offset); + } + else + { + auto b = as_vec(tensor_vec_size(x), buffer + offset); + idx.local_stride(x.get_shape().element_space(), + [&](auto i) { b[i] = x.data()[i]; }); + return x.with(b); + } + } + else + { + return x; + } + }, + invoke); +} + +template +struct shape_type : Shape +{ + using type = T; +}; + +template +constexpr auto make_shape_type(T) +{ + return shape_type{}; +} + +template +__device__ auto preload(index idx, Ts... xs) +{ + using type = remove_vec; + constexpr auto size = decltype(compute_preload_size(make_shape_type(xs)...)){}; + const index_int max_size = 512 * sizeof(type); + return [=](auto f) { + if constexpr(size > 0 and size < max_size) + { + __shared__ type buffer[size]; + preload_copy(idx, f, buffer, xs...); + } + else + { + f(xs...); + } + }; +} + +inline __device__ auto auto_preload(index idx) +{ + return make_transform([=](auto f, auto out, auto... xs) { + preload(idx, xs...)([&](auto... 
ys) { f(out, ys...); }); + }); +} + +template +__device__ auto preload_copy(index idx, T x) +{ + return [=](auto f) { + if constexpr(B) + { + using type = typename T::type; + constexpr auto size = get_shape_c{}.element_space(); + __shared__ type buffer[size]; + // TODO: Always vecotrize when size > 4, and then use a second loop for remainder + constexpr auto n = find_vectorize_size([&](auto i) { return (size % i) == 0; }); + auto input = as_vec(remove_bool(x.data())); + auto b = as_vec(remove_bool(buffer)); + idx.local_stride(size / n, [&](auto i) { b[i] = input[i]; }); + return f(x.with(buffer)); + } + else + { + return f(x); + } + }; +} + +template +__device__ auto auto_preload(index idx) +{ + return make_transform([=](auto f, auto... xs) { + auto invoke = [=](auto... ys) { + if constexpr((Bs or ...)) + __syncthreads(); + f(ys...); + }; + join(invoke, preload_copy(idx, xs)...); + }); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/print.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/print.hpp new file mode 100644 index 000000000..a12424535 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/print.hpp @@ -0,0 +1,270 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
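// Host-side sketch of the per-input staging decision made by traverse_preload and
// preload above: only broadcasted inputs whose element_space is much smaller than
// the number of reads are copied into __shared__ memory, and preload() further
// caps the combined buffer at 512 elements' worth of the value type. The function
// name should_preload is illustrative only.
#include <cstddef>

bool should_preload(bool broadcasted, std::size_t element_space, std::size_t elements)
{
    // Mirrors the traverse_preload condition: the broadcast must repeat enough
    // elements (at least 64 more reads than stored values) to be worth staging.
    return broadcasted and (elements - element_space) >= 64;
}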
+ */ +#ifndef MIGRAPHX_GUARD_KERNELS_PRINT_HPP +#define MIGRAPHX_GUARD_KERNELS_PRINT_HPP + +#include +#include +#include +#include + +namespace migraphx { + +template +struct on_exit +{ + F f; + G g; + template + __host__ __device__ auto operator()(T x) const + { + return f(x); + } + + __host__ __device__ ~on_exit() { f(g); } +}; + +template +constexpr auto print_type_name_probe() +{ + constexpr auto name = __PRETTY_FUNCTION__; + constexpr auto size = sizeof(__PRETTY_FUNCTION__); + constexpr auto parameter_name = "PrivateMIGraphXTypeNameProbe = "; + constexpr auto parameter_name_size = sizeof("PrivateMIGraphXTypeNameProbe = ") - 1; + constexpr auto begin = + search(name, name + size, parameter_name, parameter_name + parameter_name_size); + static_assert(begin < name + size, "Type probe not found."); + constexpr auto start = begin + parameter_name_size; + constexpr auto last = find_if(start, name + size, [](auto c) { return c == ']' or c == ';'; }); + return [=](const auto& s) { s.print_string(start, last - start); }; +} + +template +struct type_printer +{ + template + friend constexpr const Stream& operator<<(const Stream& s, type_printer) + { + print_type_name_probe()(s); + return s; + } +}; + +template +constexpr type_printer type_of() +{ + return {}; +} + +template +constexpr type_printer type_of(T) +{ + return {}; +} + +template +constexpr type_printer sub_type_of() +{ + return {}; +} + +template +constexpr type_printer sub_type_of(T) +{ + return {}; +} + +template +struct basic_printer +{ + F f; + __host__ __device__ const basic_printer& print_long(long value) const + { + f([&] { printf("%li", value); }); + return *this; + } + __host__ __device__ const basic_printer& print_ulong(unsigned long value) const + { + f([&] { printf("%lu", value); }); + return *this; + } + __host__ __device__ const basic_printer& print_char(char value) const + { + f([&] { printf("%c", value); }); + return *this; + } + __host__ __device__ const basic_printer& print_string(const char* value) const + { + f([&] { printf("%s", value); }); + return *this; + } + __host__ __device__ const basic_printer& print_string(const char* value, int size) const + { + f([&] { printf("%.*s", size, value); }); + return *this; + } + __host__ __device__ const basic_printer& print_double(double value) const + { + f([&] { printf("%f", value); }); + return *this; + } + __host__ __device__ const basic_printer& print_bool(bool value) const + { + f([&] { + if(value) + printf("true"); + else + printf("false"); + }); + return *this; + } + __host__ __device__ const basic_printer& operator<<(short value) const + { + return print_long(value); + } + __host__ __device__ const basic_printer& operator<<(unsigned short value) const + { + return print_ulong(value); + } + __host__ __device__ const basic_printer& operator<<(int value) const + { + return print_long(value); + } + __host__ __device__ const basic_printer& operator<<(unsigned int value) const + { + return print_ulong(value); + } + __host__ __device__ const basic_printer& operator<<(long value) const + { + return print_long(value); + } + __host__ __device__ const basic_printer& operator<<(unsigned long value) const + { + return print_ulong(value); + } + __host__ __device__ const basic_printer& operator<<(migraphx::half value) const + { + return print_double(value); + } + __host__ __device__ const basic_printer& operator<<(float value) const + { + return print_double(value); + } + __host__ __device__ const basic_printer& operator<<(double value) const + { + return print_double(value); + } + 
__host__ __device__ const basic_printer& operator<<(bool value) const + { + return print_bool(value); + } + __host__ __device__ const basic_printer& operator<<(char value) const + { + return print_char(value); + } + __host__ __device__ const basic_printer& operator<<(unsigned char value) const + { + return print_char(value); + } + __host__ __device__ const basic_printer& operator<<(const char* value) const + { + return print_string(value); + } +}; + +template +constexpr basic_printer make_printer(F f) +{ + return {f}; +} + +template +constexpr basic_printer> make_printer(F f, G g) +{ + return {{f, g}}; +} + +inline __device__ auto cout() +{ + return make_printer([](auto f) { f(); }); +} + +inline __device__ auto coutln() +{ + return make_printer([](auto f) { f(); }, [] { printf("\n"); }); +} + +template +__device__ void unsafe_print_each(Stream s, T x, Ts... xs) +{ + s << x; + each_args([&](auto xx) { s << ' ' << xx; }, xs...); +} + +template +__device__ void print_each(Stream s, Ts... xs) +{ + auto idx = make_index(); + for(auto i = 0; i < idx.nglobal(); i++) + { + if(i == idx.global) + unsafe_print_each(s, xs...); + __syncthreads(); + } +} + +template +__device__ void print_each_once(Stream s, Ts... xs) +{ + auto idx = make_index(); + if(idx.global == 0) + unsafe_print_each(s, xs...); +} + +template +__device__ void print(Ts... xs) +{ + print_each(cout(), xs...); +} + +template +__device__ void print_once(Ts... xs) +{ + print_each_once(cout(), xs...); +} + +template +__device__ void println(Ts... xs) +{ + print_each(cout(), xs..., '\n'); +} + +template +__device__ void println_once(Ts... xs) +{ + print_each_once(cout(), xs..., '\n'); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_PRINT_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ranges.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ranges.hpp new file mode 100644 index 000000000..af32a723b --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/ranges.hpp @@ -0,0 +1,49 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
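// Typical debugging usage of the helpers above inside a kernel body (sketch only):
//
//   migraphx::println_once("launch: ", idx.nglobal(), " threads"); // printed once, from global id 0
//   migraphx::println("thread ", idx.global, " got ", x);          // one line per work-item,
//                                                                  // serialized by print_each
//   migraphx::println_once(migraphx::type_of(x));                  // prints the deduced type name
//
// println_once avoids the O(nglobal) serialization loop that println pays for, so
// it is the cheaper choice when a single line is enough.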
+ */ +#ifndef MIGRAPHX_GUARD_KERNELS_RANGES_HPP +#define MIGRAPHX_GUARD_KERNELS_RANGES_HPP + +#include + +namespace migraphx { + +template +struct iterator_range +{ + Iterator start; + Iterator last; + + constexpr Iterator begin() const { return start; } + + constexpr Iterator end() const { return last; } +}; + +constexpr iterator_range range(diff_int start, diff_int last) +{ + return {{start, {}}, {last, {}}}; +} +constexpr iterator_range range(diff_int last) { return range(0, last); } + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_RANGES_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/rank.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/rank.hpp new file mode 100644 index 000000000..5765b4f3e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/rank.hpp @@ -0,0 +1,41 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +#ifndef MIGRAPHX_GUARD_KERNELS_RANK_HPP +#define MIGRAPHX_GUARD_KERNELS_RANK_HPP + +namespace migraphx { + +template +struct rank : rank +{ +}; + +template <> +struct rank<0> +{ +}; + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_RANK_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp new file mode 100644 index 000000000..76150cbfc --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp @@ -0,0 +1,785 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
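// Minimal illustration of the rank<N> overload-ordering idiom defined above:
// rank<2> derives from rank<1>, which derives from rank<0>, so a call made with
// the highest rank picks the most specialized viable overload and falls back to
// lower ranks otherwise. The helper names size_impl/size_of are illustrative and
// assume the rank template above is visible.
template <class T>
auto size_impl(migraphx::rank<1>, const T& x) -> decltype(x.size())
{
    return x.size(); // preferred when T has a .size() member
}

template <class T>
int size_impl(migraphx::rank<0>, const T&)
{
    return -1; // fallback for everything else
}

template <class T>
auto size_of(const T& x)
{
    return size_impl(migraphx::rank<1>{}, x);
}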
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_REDUCE_HPP +#define MIGRAPHX_GUARD_KERNELS_REDUCE_HPP + +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { + +#if MIGRAPHX_HAS_DPP + +template +__device__ void dpp_reduce(T& in, Op op) +{ + static_assert(SubWaveSize <= MIGRAPHX_WAVEFRONTSIZE, "Too large subwave size"); + static_assert(is_power_of_2(SubWaveSize), "SubWaveSize is not a power of 2"); + if constexpr(SubWaveSize > 1) + { + auto out = dpp_mov(in); + in = op(in, out); + } + if constexpr(SubWaveSize > 2) + { + auto out = dpp_mov(in); + in = op(in, out); + } + if constexpr(SubWaveSize > 4) + { + auto out = dpp_mov(in); + in = op(in, out); + } + if constexpr(SubWaveSize > 8) + { + auto out = dpp_mov(in); + in = op(in, out); + } +#if MIGRAPHX_WAVEFRONTSIZE == 32 + if constexpr(SubWaveSize > 16) + { + auto out = dpp_swizzle<0x1e0>(in); + in = op(in, out); + } +#else + if constexpr(SubWaveSize > 16) + { + auto out = dpp_mov(in); + in = op(in, out); + } + if constexpr(SubWaveSize > 32) + { + auto out = dpp_mov(in); + in = op(in, out); + } +#endif +} + +#if defined(MIGRAPHX_USE_CLANG_TIDY) || defined(CPPCHECK) +// NOLINTNEXTLINE +#define MIGRAPHX_DPP_REDUCE_ASM_FUN(type, op, ins) \ + template \ + __device__ inline void dpp_reduce(type& x, op f) \ + { \ + (void)f; \ + x = 1; \ + } +#else +#define MIGRAPHX_DPP_IIF64(then, ...) then +#define MIGRAPHX_DPP_IIF32(then, ...) __VA_ARGS__ +#define MIGRAPHX_DPP_IF_64(x) MIGRAPHX_PP_CAT(MIGRAPHX_DPP_IIF, x) +#define MIGRAPHX_DPP_WHEN_64(x) MIGRAPHX_DPP_IF_64(x)(MIGRAPHX_PP_EXPAND, MIGRAPHX_PP_EAT) + +#define MIGRAPHX_DPP_REDUCE_ASM0(ins) #ins " %0 %0 %0 row_shr:1\n" +#define MIGRAPHX_DPP_REDUCE_ASM1(ins) #ins " %0 %0 %0 row_shr:2\n" +#define MIGRAPHX_DPP_REDUCE_ASM2(ins) #ins " %0 %0 %0 row_shr:4 bank_mask:0xe\n" +#define MIGRAPHX_DPP_REDUCE_ASM3(ins) #ins " %0 %0 %0 row_shr:8 bank_mask:0xc\n" +#define MIGRAPHX_DPP_REDUCE_ASM4(ins) #ins " %0 %0 %0 row_bcast:15 row_mask:0xa\n" +#define MIGRAPHX_DPP_REDUCE_ASM5(ins) #ins " %0 %0 %0 row_bcast:31 row_mask:0xc\n" + +#define MIGRAPHX_DPP_REDUCE_ASM_REPEAT(i, ins) \ + MIGRAPHX_PP_CAT(MIGRAPHX_DPP_REDUCE_ASM, i)(ins) "s_nop 1\n" +#define MIGRAPHX_DPP_REDUCE_ASM(n, x, ins, ...) 
\ + { \ + __asm__ volatile("s_nop 4\n" MIGRAPHX_PP_REPEAT(n, MIGRAPHX_DPP_REDUCE_ASM_REPEAT, ins) \ + : "=v"(x) \ + : "0"(x)); \ + __VA_ARGS__ \ + } + +#if MIGRAPHX_WAVEFRONTSIZE == 64 +#define MIGRAPHX_DPP_REDUCE_SWIZZLE(x, f) (void)f; +#else +#define MIGRAPHX_DPP_REDUCE_SWIZZLE(x, f) \ + auto y = dpp_swizzle<0x1e0>(x); \ + x = f(x, y); +#endif + +#define MIGRAPHX_DPP_REDUCE_ASM_FUN(type, op, ins) \ + template \ + __device__ inline void dpp_reduce(type& x, op f) \ + { \ + if constexpr(SubWaveSize == 2) \ + MIGRAPHX_DPP_REDUCE_ASM(0, x, ins, ); \ + if constexpr(SubWaveSize == 4) \ + MIGRAPHX_DPP_REDUCE_ASM(1, x, ins, ); \ + if constexpr(SubWaveSize == 8) \ + MIGRAPHX_DPP_REDUCE_ASM(2, x, ins, ); \ + if constexpr(SubWaveSize == 16) \ + MIGRAPHX_DPP_REDUCE_ASM(3, x, ins, ); \ + if constexpr(SubWaveSize == 32) \ + MIGRAPHX_DPP_REDUCE_ASM(MIGRAPHX_DPP_IF_64(MIGRAPHX_WAVEFRONTSIZE)(4, 3), \ + x, \ + ins, \ + MIGRAPHX_DPP_REDUCE_SWIZZLE(x, f)); \ + MIGRAPHX_DPP_WHEN_64(MIGRAPHX_WAVEFRONTSIZE) \ + (if constexpr(SubWaveSize == 64) MIGRAPHX_DPP_REDUCE_ASM(5, x, ins, )); \ + } +#endif + +// Navi21 doesn't support int32 dpp +#if defined(__gfx1030__) +// NOLINTNEXTLINE +#define MIGRAPHX_DPP_REDUCE(op, prefix, sign) \ + MIGRAPHX_DPP_REDUCE_ASM_FUN(double, op, prefix##_f64); \ + MIGRAPHX_DPP_REDUCE_ASM_FUN(float, op, prefix##_f32); \ + MIGRAPHX_DPP_REDUCE_ASM_FUN(half, op, prefix##_f16); \ + MIGRAPHX_DPP_REDUCE_ASM_FUN(uint32_t, op, prefix##_u32); +#else +// NOLINTNEXTLINE +#define MIGRAPHX_DPP_REDUCE(op, prefix, sign) \ + MIGRAPHX_DPP_REDUCE_ASM_FUN(double, op, prefix##_f64); \ + MIGRAPHX_DPP_REDUCE_ASM_FUN(float, op, prefix##_f32); \ + MIGRAPHX_DPP_REDUCE_ASM_FUN(half, op, prefix##_f16); \ + MIGRAPHX_DPP_REDUCE_ASM_FUN(int32_t, op, prefix##sign##32); \ + MIGRAPHX_DPP_REDUCE_ASM_FUN(uint32_t, op, prefix##_u32); +#endif + +// Note: when max and min are in int32_t, signed version of instruction needs to be used. 
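// Host-side analogue of the in-wavefront tree reduction that the DPP assembly above
// performs: at step s, lane i combines its value with the value s lanes below it
// (the row_shr shifts), so after log2(width) steps the last lane holds the reduction
// of all lanes, which the caller then reads back with readlane. Illustrative only.
#include <array>
#include <cstddef>

template <std::size_t Width, class Op>
std::array<float, Width> tree_reduce(std::array<float, Width> lanes, Op op)
{
    for(std::size_t s = 1; s < Width; s *= 2)
        for(std::size_t i = Width; i-- > s;) // descending so reads see pre-step values
            lanes[i] = op(lanes[i], lanes[i - s]);
    return lanes; // lanes[Width - 1] is the full reduction
}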
+MIGRAPHX_DPP_REDUCE(op::sum, v_add, _u) +MIGRAPHX_DPP_REDUCE(op::product, v_mul, _u) +MIGRAPHX_DPP_REDUCE(op::max, v_max, _i) +MIGRAPHX_DPP_REDUCE(op::min, v_min, _i) + +template +__device__ void dpp_reduce(T& in, Op op) +{ + dpp_reduce(in, op); +} + +template +__device__ auto subwave_reduce(index idx, Op op, T init, Index n, F f) +{ + MIGRAPHX_ASSERT(idx.max_nlocal() == idx.nlocal() or (idx.nlocal() % SubWaveSize) == 0); + using type = decltype(index::invoke_loop(f, 0, _c<0>)); + auto x = type(init); + idx.local_subwave_stride( + n, [&](auto i, auto d) { x = op(x, index::invoke_loop(f, i, d)); }); + dpp_reduce(x, op); + return readlane(x); +} + +template +__device__ auto wave_reduce(index idx, Op op, T init, Index n, F f) +{ + return subwave_reduce(idx, op, init, n, f); +} + +template +__device__ auto block_reduce(index idx, Op op, T init, Index n, F f) +{ + MIGRAPHX_ASSERT(idx.max_nlocal() == idx.nlocal()); +#ifdef MIGRAPHX_HAS_CONST_LOCAL + if constexpr(decltype(idx.nlocal()){} == MIGRAPHX_WAVEFRONTSIZE) + return wave_reduce(idx, op, init, n, f); +#endif + constexpr index_int lanes_per_thread = MIGRAPHX_WAVEFRONTSIZE; + using type = decltype(index::invoke_loop(f, 0, _c<0>)); + __shared__ type buffer[idx.max_nlocal() / lanes_per_thread]; + auto x = type(init); + idx.local_stride(n, [&](auto i, auto d) { x = op(x, index::invoke_loop(f, i, d)); }); + dpp_reduce(x, op); + + const auto ldsidx = idx.local / lanes_per_thread; + if((idx.local % lanes_per_thread) == lanes_per_thread - 1) + { + buffer[ldsidx] = x; + } + __syncthreads(); + + type y = type(init); + for(index_int i = 0; i < idx.nlocal() / lanes_per_thread; i++) + { + y = op(y, buffer[i]); + } + return y; +} +#else +template +__device__ auto block_reduce(index idx, Op op, T init, Index n, F f) +{ + MIGRAPHX_ASSERT(idx.max_nlocal() == idx.nlocal()); + using type = decltype(index::invoke_loop(f, 0, _c<0>)); + __shared__ type buffer[idx.max_nlocal()]; + auto x = type(init); + idx.local_stride(n, [&](auto i, auto d) { x = op(x, index::invoke_loop(f, i, d)); }); + buffer[idx.local] = x; + __syncthreads(); + + for(index_int s = 1; s < idx.nlocal(); s *= 2) + { + const index_int index = 2 * s * idx.local; + if(index + s < idx.nlocal()) + { + buffer[index] = op(buffer[index], buffer[index + s]); + } + __syncthreads(); + } + return buffer[0]; +} +#endif + +template +constexpr auto reduce_slice(Input input, T i) +{ + constexpr auto lens = transform(get_shape_c{}.lens, + get_shape_c{}.lens, + [](index_int x, index_int y) -> index_int { + if(x == y) + return 1; + return x; + }); + ; + constexpr auto s = make_shape(lens, get_shape_c{}.strides); + MIGRAPHX_ASSERT((input.get_shape().index(i) + s.element_space()) <= + input.get_shape().element_space()); + return make_tensor_view(&input[i], s); +} + +namespace reduce { + +struct inner_storage_tag +{ +}; + +template +using is_inner_storage = is_base_of>>; + +template +struct lazy_inner_storage : inner_storage_tag +{ + using type = remove_reference_t()(0, _c<0>))>; + F f; + constexpr Size rsize() const { return {}; } + template + constexpr auto operator()(U j, V d) const + { + return f(j, d); + } +}; + +template +constexpr lazy_inner_storage make_lazy_inner_storage(Size, F f) +{ + return {{}, f}; +} + +template +constexpr auto make_indices(Size size) +{ + return make_lazy_inner_storage(size, [](auto j, auto) { return j; }); +} + +template +struct storage_access : F +{ + using type = R; +}; + +template +constexpr storage_access make_storage_access(F f) +{ + return {{f}}; +} + +template +constexpr auto 
sliced(Slicer slicer, F f) +{ + return [=](auto x, auto... xs) { + // TODO: assert all elements are the same + return f(slicer(x), slicer(xs)...); + }; +} + +template +constexpr auto compute_reduce_axis() +{ + constexpr auto lens = + transform_i(get_shape_c{}.lens, [](index_int x, index_int i) -> index_int { + if(i == Axis) + return 1; + return x; + }); + return make_shape(lens, get_shape_c{}.strides); +} + +template +constexpr auto final_reduce(T x, F f) +{ + return vec_reduce(x, f); +} + +template +constexpr auto final_reduce(array a, F f) +{ + return a.apply([&](auto x) { return final_reduce(x, f); }); +} + +template +using with_axis = decltype(compute_reduce_axis()); + +template +struct reducer_base +{ + template + __device__ decltype(auto) make_inner_slice(T&& x) const + { + if constexpr(is_inner_storage{}) + { + return x; + } + else + { + auto&& derived = static_cast(*this); + auto t = derived.slice(x); + return make_storage_access( + [=](auto i, auto...) -> auto& { return t[i]; }); + } + } + + template + constexpr auto get_size(T&& x, [[maybe_unused]] Ts&&... xs) const + { + MIGRAPHX_ASSERT(get_size(x) == get_size(xs...)); + return get_size(x); + } + + template + constexpr auto get_size(T&& x) const + { + if constexpr(is_inner_storage{}) + { + return x.rsize(); + } + else + { + auto&& derived = static_cast(*this); + auto t = derived.slice(x); + return t.size(); + } + } + + template + __device__ auto inner_sliced(F f) const + { + return [=](auto&&... xs) { return f(get_size(xs...), make_inner_slice(xs)...); }; + } + + template + static __device__ typename T::type& decl_inner_storage(const T&); + + template + __device__ auto inner(F f) const + { + return this->inner_sliced([=](auto n, auto&&... xs) { + using result_type = decltype(f(decl_inner_storage(xs)...)); + auto&& derived = static_cast(*this); + if constexpr(is_void{}) + { + derived.inner_void_impl(f, n, xs...); + } + else + { + return derived.template inner_impl(f, n, xs...); + } + }); + } + + template + __device__ auto lazy_inner(F f) const + { + return this->inner_sliced([=](auto n, auto&&... xs) { + return make_lazy_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); }); + }); + } + + template + __device__ auto reduce(Op op, T init, Read read) const + { + return this->inner_sliced([=](auto n, auto&&... xs) { + auto&& derived = static_cast(*this); + return derived.reduce_impl(op, init, read, n, xs...); + }); + } + + template + __device__ auto reduce(Op op, T init) const + { + return this->reduce(op, init, op::id{}); + } + + template + __device__ void outer(F f) const + { + f(); + } + + template + constexpr auto elements() const + { + auto&& derived = static_cast(*this); + using reduce_type = decltype(derived.slice(Input{})); + using value_type = typename Input::type; + constexpr auto relements = get_shape_c{}.elements(); + if constexpr(vec_size() > 1) + return relements * vec_size(); + else + return relements; + } +}; + +struct block +{ + template + struct reducer : reducer_base> + { + index idx; + Slicer slice; + + template + struct inner_storage : inner_storage_tag + { + using type = T; + array arr; + constexpr Size rsize() const { return {}; } + template + constexpr auto& operator()(U, V d) const + { + return arr[d]; + } + template + constexpr auto& operator()(U, V d) + { + return arr[d]; + } + }; + + template + __device__ auto reduce_impl(Op op, T init, Read read, N n, Ts&&... 
xs) const + { + return block_reduce(idx, op, init, n, [&](auto j, auto d) { + return final_reduce(read(xs(j, d)...), op); + }); + } + + template + __device__ void outer(F f) const + { + if(idx.local == 0) + f(); + } + + template + __device__ void inner_void_impl(F f, N n, Ts&&... xs) const + { + idx.local_stride(n, [&](auto j, auto d) { f(xs(j, d)...); }); + } + + template + __device__ auto inner_impl(F f, N n, Ts&&... xs) const + { + using max_iterations = decltype(idx.max_local_stride_iterations(n)); + inner_storage storage; + idx.local_stride(n, [&](auto j, auto d) { storage(j, d) = R{f(xs(j, d)...)}; }); + return storage; + } + }; + + template + static __device__ auto make(index idx, Slicer slicer) + { + return reducer{{}, idx, slicer}; + } + + template + static __device__ void run(F f) + { + auto idx = make_index(); + constexpr auto nelements = get_shape_c{}.elements(); + idx.global_stride(nelements * idx.nlocal(), [&](auto i) { + const auto out_idx = get_shape_c{}.multi(i / idx.nlocal()); + f(out_idx, make(idx, [&](auto input) { return reduce_slice(input, out_idx); })); + }); + } +}; + +struct block_large +{ + template + struct reducer : reducer_base> + { + index idx; + Slicer slice; + + template + __device__ auto reduce_impl(Op op, T init, Read read, N n, Ts&&... xs) const + { + return block_reduce(idx, op, init, index_int{n}, [&](auto j, auto d) { + return final_reduce(read(xs(j, d)...), op); + }); + } + + template + __device__ void outer(F f) const + { + if(idx.local == 0) + f(); + } + + template + __device__ void inner_void_impl(F f, N n, Ts&&... xs) const + { + idx.local_stride(index_int{n}, [&](auto j, auto d) { f(xs(j, d)...); }); + } + + template + __device__ auto inner_impl(F f, N n, Ts&&... xs) const + { + return make_lazy_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); }); + } + }; + + template + static __device__ auto make(index idx, Slicer slicer) + { + return reducer{{}, idx, slicer}; + } + + template + static __device__ void run(F f) + { + auto idx = make_index(); + constexpr auto nelements = get_shape_c{}.elements(); + idx.global_stride(nelements * idx.nlocal(), [&](auto i) { + const auto out_idx = get_shape_c{}.multi(i / idx.nlocal()); + f(out_idx, make(idx, [&](auto input) { return reduce_slice(input, out_idx); })); + }); + } +}; + +template +struct subwave +{ + template + struct reducer : reducer_base> + { + index idx; + Slicer slice; + + template + struct inner_storage : inner_storage_tag + { + using type = T; + array arr; + constexpr Size rsize() const { return {}; } + template + constexpr auto& operator()(U, V d) const + { + return arr[d]; + } + template + constexpr auto& operator()(U, V d) + { + return arr[d]; + } + }; + + template + __device__ auto reduce_impl(Op op, T init, Read read, N n, Ts&&... xs) const + { + return subwave_reduce(idx, op, init, n, [&](auto j, auto d) { + return final_reduce(read(xs(j, d)...), op); + }); + } + + template + __device__ void outer(F f) const + { + if(idx.local_subwave() == 0) + f(); + } + + template + __device__ void inner_void_impl(F f, N n, Ts&&... xs) const + { + idx.local_subwave_stride(n, [&](auto j, auto d) { f(xs(j, d)...); }); + } + + template + __device__ auto inner_impl(F f, N n, Ts&&... 
xs) const + { + using max_iterations = + decltype(idx.max_local_subwave_stride_iterations(n)); + inner_storage storage; + idx.local_subwave_stride( + n, [&](auto j, auto d) { storage(j, d) = f(xs(j, d)...); }); + return storage; + } + }; + + template + static __device__ auto make(index idx, Slicer slicer) + { + return reducer{{}, idx, slicer}; + } + + template + static __device__ void run(F f) + { + auto idx = make_index(); + constexpr auto nelements = get_shape_c{}.elements(); + idx.global_stride(nelements * idx.nlocal_subwave(), [&](auto i) { + const auto out_idx = get_shape_c{}.multi(i / idx.nlocal_subwave()); + f(out_idx, make(idx, [&](auto input) { return reduce_slice(input, out_idx); })); + }); + } +}; + +using wave = subwave; + +struct lane +{ + template + struct reducer : reducer_base> + { + index idx; + Slicer slice; + + template + __device__ auto reduce_impl(Op op, T init, Read read, N n, U&& x, Us&&... xs) const + { + using type = remove_reference_t), xs(0, _c<0>)...))>; + type r = type(init); + for(index_int j = 0; j < n; j++) + { + r = op(r, read(x(j, _c<0>), xs(j, _c<0>)...)); + } + return r; + } + + template + __device__ void outer(F f) const + { + f(); + } + + template + __device__ void inner_void_impl(F f, N n, Ts&&... xs) const + { + for(index_int j = 0; j < n; j++) + { + f(xs(j, _c<0>)...); + } + } + + template + __device__ auto inner_impl(F f, N n, Ts&&... xs) const + { + return make_lazy_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); }); + } + }; + template + static __device__ auto make(index idx, Slicer slicer) + { + return reducer{{}, idx, slicer}; + } + + template + static __device__ void run(F f) + { + auto idx = make_index(); + constexpr auto nelements = get_shape_c{}.elements(); + idx.global_stride(nelements, [&](auto i) { + const auto out_idx = get_shape_c{}.multi(i); + f(out_idx, make(idx, [&](auto input) { return reduce_slice(input, out_idx); })); + }); + } +}; + +// TODO: Remove these in the future when they can be selected in the compiler class +template +constexpr auto pick_block() +{ + using nlocal = decltype(index{}.max_nlocal()); + if constexpr(RElements < nlocal{} * 256) + return block{}; + else + return block_large{}; +} +template +using auto_block = decltype(pick_block()); + +template +constexpr auto reduce_elements_with_axis() +{ + constexpr auto s = get_shape_c{}; + return s.lens[Axis]; +} + +} // namespace reduce + +template +__device__ void +simple_reduce(Op op, T init, Input input, Output output, ReadInput read, WriteOuput write) +{ + Algo::template run([&](auto out_idx, auto r) { + auto x = r.reduce(op, init, read)(input); + r.outer([&] { output[out_idx] = write(x); }); + }); +} + +template +__device__ void fused_reduce(Output output_pack, Assign assign, F f) +{ + Algo::template run([&](auto out_idx, auto r) { + auto result_tuple = f(r, out_idx); + unpack_each( + [&](auto output, auto result) { + if constexpr(reduce::is_inner_storage{}) + { + r.inner([&](auto& y, auto x) { assign(y, x); })(output, result); + } + else + { + r.outer([&] { assign(output[out_idx], implicit_conversion(result)); }); + } + }, + output_pack, + result_tuple); + }); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_REDUCE_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp new file mode 100644 index 000000000..b7d7216c6 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp @@ 
-0,0 +1,229 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP +#define MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { + +struct max_pool +{ + MIGRAPHX_DEVICE_CONSTEXPR auto init() { return lowest{}; } + + template + MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y) + { + return max(x, y); + } + + template + MIGRAPHX_DEVICE_CONSTEXPR T final(T x, index_int) + { + return (x); + } +}; + +struct avg_pool +{ + MIGRAPHX_DEVICE_CONSTEXPR auto init() { return 0.0; } + + template + MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y) + { + return x + y; + } + + template + MIGRAPHX_DEVICE_CONSTEXPR T final(T x, index_int y) + { + return (y == 0) ? 
T{0.0} : T{x / y}; + } +}; + +template +MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate( + const Iterator data, const array& dims, array xy, Op pooling) +{ + array low{}; + array high{}; + for(index_int ii = 0; ii < xy.size(); ++ii) + { + if(xy[ii] < -1.0f or xy[ii] > dims[ii]) + { + return implicit_conversion(0); + } + + xy[ii] = migraphx::max(xy[ii], 0.0f); + low[ii] = xy[ii]; + high[ii] = low[ii] + 1; + if(low[ii] >= dims[ii] - 1) + { + xy[ii] = high[ii] = low[ii] = dims[ii] - 1; + } + } + array locs = {low[0] * dims[1] + low[1], + low[0] * dims[1] + high[1], + high[0] * dims[1] + low[1], + high[0] * dims[1] + high[1]}; + + float ly = xy[0] - low[0]; + float lx = xy[1] - low[1]; + float hy = 1.0f - ly; + float hx = 1.0f - lx; + // do calculations in floating point and convert final result to required type + array ws = {hy * hx, hy * lx, ly * hx, ly * lx}; + + auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]); + auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]); + return implicit_conversion(pooling(v01, v23)); +} + +template +MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data, + const array& roi_starts, + const array& bin_size, + const array& idx, + const array& bin_grid_size, + const array& dims, + float roi_offset, + Op op) +{ + using in_dtype = typename Iterator::value_type; + in_dtype output_val = in_dtype{op.init()}; + const int64_t count = bin_grid_size[0] * bin_grid_size[1]; + dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) { + array id = {iy, ix}; + array locs = + roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset; + + auto val = bilinear_interpolate(data, dims, locs, op); + output_val = op(output_val, val); + }); + return op.final(output_val, count); +} + +template +struct roalign_settings +{ + T1 roi_offset{}; + T2 is_avg_pooling{}; + T3 sampling_ratio{}; + T4 spatial_scale{}; +}; + +template +constexpr roalign_settings make_roalign_settings(Ts... 
xs) +{ + return {xs...}; +} + +template +__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t, Settings s) +{ + auto index = make_index(); + const auto x = x_t.begin(); + const auto rois = rois_t.begin(); + const auto ind = ind_t.begin(); + // input shape + auto x_lens = x_t.get_shape().lens; + auto channel_num = x_lens[1]; + // input dims of height and width, in all 2-dim arrays, the first dim + // is for height and second dim is for width + array in_dims = {x_lens[2], x_lens[3]}; + + const auto stride = index.nglobal(); + auto out_s = y_t.get_shape(); + auto roi_column_num = rois_t.get_shape().lens[1]; + + // output dims of height and width, in all 2-dim arrays, the first dim + // is for height and second dim is for width + const auto& out_lens = out_s.lens; + array out_dims = {out_lens[2], out_lens[3]}; + + for(index_int i = index.global; i < out_s.elements(); i += stride) + { + auto idx = out_s.multi(i); + int n = idx[0]; + int c = idx[1]; + int ph = idx[2]; + int pw = idx[3]; + + const auto offset_rois = rois + (n * roi_column_num); + const int batch_ind = ind[n]; + + array roi_starts = { + static_cast(offset_rois[1]) * static_cast(s.spatial_scale), + static_cast(offset_rois[0]) * static_cast(s.spatial_scale)}; + array roi_ends = { + static_cast(offset_rois[3]) * static_cast(s.spatial_scale), + static_cast(offset_rois[2]) * static_cast(s.spatial_scale)}; + + array roi_size{}; + array bin_size{}; + array bin_grid_size{}; + + for(index_int ii = 0; ii < roi_size.size(); ++ii) + { + roi_size[ii] = roi_ends[ii] - roi_starts[ii]; + roi_size[ii] = migraphx::max(roi_size[ii], 1.0f); + + bin_size[ii] = roi_size[ii] / out_dims[ii]; + bin_grid_size[ii] = (s.sampling_ratio > 0) + ? s.sampling_ratio + : migraphx::ceil(roi_size[ii] / out_dims[ii]); + } + + const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]); + if constexpr(s.is_avg_pooling) + { + y_t[i] = calc_pooling(offset_x, + roi_starts, + bin_size, + {ph, pw}, + bin_grid_size, + in_dims, + s.roi_offset, + avg_pool{}); + } + else + { + y_t[i] = calc_pooling(offset_x, + roi_starts, + bin_size, + {ph, pw}, + bin_grid_size, + in_dims, + s.roi_offset, + max_pool{}); + } + } +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/scatter.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/scatter.hpp new file mode 100644 index 000000000..efe5fe347 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/scatter.hpp @@ -0,0 +1,70 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_SCATTER_ELEMENTS_HPP +#define MIGRAPHX_GUARD_KERNELS_SCATTER_ELEMENTS_HPP + +#include +#include +#include + +namespace migraphx { + +// Checks and skips out of bounds indices if SkipOutOfBounds is true. +// Otherwise does not check and underfined behavior if out of bounds. +template +__device__ void scatter(const T& indices_t, const U& updates_t, const V& output_t, F f) +{ + auto gpu_index = make_index(); + auto indices_shape = indices_t.get_shape(); + auto output_shape = output_t.get_shape(); + auto axis_dim_size = output_shape.lens[Axis]; + + gpu_index.global_stride(indices_shape.elements(), [&](auto i) { + auto out_idx = indices_shape.multi(i); + auto index = indices_t[i]; + index = index < 0 ? index + axis_dim_size : index; + if constexpr(SkipOutOfBounds) + { + if(index < 0) + { + return; + } + } + out_idx[Axis] = index; + if constexpr(SkipOutOfBounds) + { + if(not equal( + out_idx.begin(), out_idx.end(), output_shape.lens.begin(), [](auto x, auto y) { + return x < y; + })) + { + return; + } + } + f(output_t[out_idx], updates_t[i]); + }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/scatter_reduction_modes.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/scatter_reduction_modes.hpp new file mode 100644 index 000000000..166552a84 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/scatter_reduction_modes.hpp @@ -0,0 +1,79 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
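// [Illustrative sketch, not part of this patch.] Host-side mirror of the per-element logic in
// the scatter kernel above (scatter.hpp): a possibly negative index along Axis is wrapped by the
// axis length, and when SkipOutOfBounds is true, updates whose final coordinate falls outside the
// output are dropped instead of written. All names below are local to this sketch.
#include <cstdint>
#include <vector>

// Simplest case: scatter 'updates' into a 1-D output along its only axis.
template <bool SkipOutOfBounds>
void scatter_axis0_1d(const std::vector<std::int64_t>& indices,
                      const std::vector<float>& updates,
                      std::vector<float>& output)
{
    const std::int64_t axis_len = static_cast<std::int64_t>(output.size());
    for(std::size_t i = 0; i < indices.size(); ++i)
    {
        std::int64_t idx = indices[i];
        if(idx < 0)
            idx += axis_len; // negative indices count from the end, as in the kernel
        if constexpr(SkipOutOfBounds)
        {
            if(idx < 0 or idx >= axis_len)
                continue; // skipped, matching the SkipOutOfBounds=true branches above
        }
        output[static_cast<std::size_t>(idx)] = updates[i]; // plain write; see assign_none below
    }
}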
+ */ +#ifndef MIGRAPHX_GUARD_KERNELS_SCATTER_REDUCTION_MODES_HPP +#define MIGRAPHX_GUARD_KERNELS_SCATTER_REDUCTION_MODES_HPP + +#include +#include +#include + +namespace migraphx { + +struct assign_none +{ + template + MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const + { + x = y; + } +}; + +struct assign_add +{ + template + MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const + { + atomic_assign(x, y, op::sum{}); + } +}; + +struct assign_mul +{ + template + MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const + { + atomic_assign(x, y, op::product{}); + } +}; + +struct assign_max +{ + template + MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const + { + atomic_assign(x, y, op::max{}); + } +}; + +struct assign_min +{ + template + MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const + { + atomic_assign(x, y, op::min{}); + } +}; + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/scatternd.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/scatternd.hpp new file mode 100644 index 000000000..dee649e8c --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/scatternd.hpp @@ -0,0 +1,61 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
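// [Illustrative sketch, not part of this patch.] The assign_* functors above are the 'f' callback
// handed to scatter()/scatternd(): they decide how an update combines with the existing output
// element (plain store vs. atomic sum/product/max/min). A host analogue showing why duplicate
// indices need a reduction mode; names are local to this sketch.
#include <vector>

struct host_assign_none
{
    void operator()(float& x, float y) const { x = y; } // last writer wins
};
struct host_assign_add
{
    void operator()(float& x, float y) const { x += y; } // duplicates accumulate
};

template <class Assign>
void scatter_1d(const std::vector<int>& indices,
                const std::vector<float>& updates,
                std::vector<float>& output,
                Assign assign)
{
    for(std::size_t i = 0; i < indices.size(); ++i)
        assign(output[static_cast<std::size_t>(indices[i])], updates[i]);
}
// With indices {0, 0} and updates {1, 2}: host_assign_none leaves output[0] == 2, while
// host_assign_add leaves output[0] == 3 (the device versions use atomic_assign for this).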
+ */ +#ifndef MIGRAPHX_GUARD_KERNELS_SCATTERND_HPP +#define MIGRAPHX_GUARD_KERNELS_SCATTERND_HPP + +#include +#include +#include + +namespace migraphx { + +template +__device__ void scatternd(const T& indices_t, const U& updates_t, const V& output_t, F f) +{ + auto index = make_index(); + auto updates_shape = updates_t.get_shape(); + + index.global_stride(updates_shape.elements(), [&](auto i) { + auto output_shape = output_t.get_shape(); + + auto indices_shape = indices_t.get_shape(); + auto k = indices_shape.lens.back(); + auto q = indices_shape.lens.size(); + + auto updates_idx = updates_shape.multi(i); + auto indices_idx = indices_shape.multi(0); + copy(updates_idx.begin(), updates_idx.begin() + q - 1, indices_idx.begin()); + + auto index_start = indices_t.begin() + indices_shape.index(indices_idx); + auto index_end = index_start + k; + auto out_idx = output_shape.multi(0); + copy(index_start, index_end, out_idx.begin()); + copy(updates_idx.begin() + q - 1, updates_idx.end(), out_idx.begin() + k); + + f(output_t[out_idx], updates_t[i]); + }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/shape.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/shape.hpp new file mode 100644 index 000000000..0828328d7 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/shape.hpp @@ -0,0 +1,201 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
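// [Illustrative sketch, not part of this patch.] ScatterND index mapping from the kernel above
// (scatternd.hpp), mirrored on the host: the last dimension of 'indices' (length k) supplies the
// first k output coordinates, and the remaining update coordinates pass through unchanged. Here
// k == 2 and the output is a dense row-major matrix; names are local to this sketch.
#include <array>
#include <cstddef>
#include <vector>

void scatternd_k2(const std::vector<std::array<std::size_t, 2>>& indices, // one (row, col) per update
                  const std::vector<float>& updates,
                  std::vector<float>& output,
                  std::size_t cols)
{
    for(std::size_t i = 0; i < indices.size(); ++i)
    {
        const auto [row, col] = indices[i];
        output[row * cols + col] = updates[i]; // assign_none; other modes reduce duplicates instead
    }
}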
+ */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_SHAPE_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_SHAPE_HPP + +#include +#include +#include +#include + +namespace migraphx { + +template +struct shape : equality_comparable> +{ + using shape_type = shape; + using index_array = typename Lens::base_array; + Lens lens = {}; + Strides strides = {}; + + constexpr shape() = default; + + constexpr shape(Lens l, Strides s) : lens(l), strides(s) {} + + constexpr auto elements() const { return _c; } + + constexpr auto element_space() const { return _c; } + + constexpr auto packed() const { return not skips() and elements() == element_space(); } + constexpr auto broadcasted() const { return _c; } + constexpr auto transposed() const + { + return return_c([] { + auto lstrides = Strides{}; + if(shape{}.broadcasted()) + { + index_array s{}; + auto out = copy_if( + lstrides.begin(), lstrides.end(), s.begin(), [](auto x) { return x != 0; }); + return not is_sorted(s.begin(), out, greater{}); + } + else + { + return not is_sorted(lstrides.begin(), lstrides.end(), greater{}); + } + }); + } + constexpr auto skips() const + { + return return_c([] { + auto lstrides = Strides{}; + return none_of(lstrides.begin(), lstrides.end(), [](auto x) { return x == 1; }); + }); + } + + constexpr auto standard() const { return packed() and not transposed(); } + + constexpr index_int index(index_array x) const { return x.dot(strides); } + + constexpr index_int index(index_int i) const + { + if(this->standard()) + { + MIGRAPHX_ASSERT(i == compute_index(i)); + return i; + } + else + { + return compute_index(i); + } + } + + constexpr index_int compute_index(index_int i) const + { + const auto rank = this->lens.size(); + index_int s = 1; + index_int result = 0; + for(index_int j = 0; j < rank; j++) + { + const index_int k = rank - j - 1; + const index_int stride = this->strides[k]; + const index_int len = this->lens[k]; + const index_int slen = s * len; + const index_int idx = (i % slen) / s; + result += stride * idx; + s = slen; + } + return result; + } + + /// Convert single index into a multi-index + constexpr index_array multi(index_int idx) const { return lens.multi(idx); } + + /// Convert multi-index into a single index + constexpr index_int single(index_array idx) const + { + if(idx.empty()) + return 0; + return inner_product(lens.begin() + 1, lens.end(), idx.begin(), idx.back()); + } + + constexpr shape get_shape() const { return *this; } + + template + friend constexpr bool operator==(const shape& x, const shape& y) + { + return x.lens == y.lens and x.strides == y.strides; + } + + template + friend constexpr const Stream& operator<<(const Stream& ss, const shape& s) + { + ss << "{" << s.lens << "}, {" << s.strides << "}"; + return ss; + } +}; + +template +constexpr auto calculate_strides(Lens) +{ + return return_array_c([] { + Lens lens{}; + array strides{1}; + const auto n = lens.size() - 1; + index_int stride = 1; + for(index_int i = 0; i < n; i++) + { + auto ri = n - i; + stride *= lens[ri]; + strides[ri - 1] = stride; + } + return strides; + }); +} + +template +constexpr shape make_shape(Lens lens, Strides strides) +{ + return {lens, strides}; +} + +template +constexpr auto make_shape(Lens lens) +{ + return make_shape(lens, calculate_strides(lens)); +} + +template +constexpr auto reorder_shape(Shape, Permutation) +{ + constexpr auto lens = return_array_c([] { return reorder_dims(Shape{}.lens, Permutation{}); }); + constexpr auto strides = + return_array_c([] { return reorder_dims(Shape{}.strides, Permutation{}); 
}); + return make_shape(lens, strides); +} + +template +constexpr auto make_shape_from_permutation(Lens, Permutation) +{ + constexpr auto new_lens = reorder_dims(Lens{}, Permutation{}); + return reorder_shape(make_shape(new_lens), invert_permutation(Permutation{})); +} + +template +constexpr auto make_packed_shape(Shape) +{ + constexpr auto s = Shape{}; + if constexpr(s.packed()) + { + return s; + } + else + { + return make_shape_from_permutation(s.lens, find_permutation(s)); + } +} + +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp new file mode 100644 index 000000000..e9c2ac36f --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp @@ -0,0 +1,51 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_SOFTMAX_HPP +#define MIGRAPHX_GUARD_KERNELS_SOFTMAX_HPP + +#include +#include + +namespace migraphx { + +template +__device__ void softmax(Input input1, Output output) +{ + using block = reduce::auto_block()>; + block::template run>([&](auto, auto r) { + auto x = r.inner(op::id{})(input1); +#ifdef MIGRAPHX_USE_FAST_SOFTMAX + const auto c = vec_at(r.slice(input1)[0], 0); +#else + const auto c = r.reduce(op::max{}, lowest{}, op::id{})(x); +#endif + r.inner([&](auto& x1) { x1 = migraphx::exp(x1 - c); })(x); + auto batch_sum = + r.reduce(op::sum{}, 0, [](auto x1) { return migraphx::convert(x1); })(x); + r.inner([&](auto& y, auto x1) { y = implicit_conversion(x1 / batch_sum); })(output, x); + }); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_SOFTMAX_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp new file mode 100644 index 000000000..e959ed6be --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp @@ -0,0 +1,113 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
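// [Illustrative sketch, not part of this patch.] The softmax kernel above follows the usual
// numerically stable recipe per reduction slice: subtract the slice max (the
// MIGRAPHX_USE_FAST_SOFTMAX path reads the first slice element instead), exponentiate, then divide
// by the slice sum. Host-side version of the same three steps:
#include <algorithm>
#include <cmath>
#include <vector>

void softmax_1d(std::vector<float>& x)
{
    if(x.empty())
        return;
    const float c = *std::max_element(x.begin(), x.end()); // r.reduce(op::max{}, lowest{}, ...)
    float sum = 0.0f;
    for(float& v : x)
    {
        v = std::exp(v - c); // r.inner([&](auto& x1) { x1 = exp(x1 - c); })
        sum += v;            // r.reduce(op::sum{}, 0, ...)
    }
    for(float& v : x)
        v /= sum; // final normalisation written to the output tensor
}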
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_TENSOR_VIEW_HPP +#define MIGRAPHX_GUARD_KERNELS_TENSOR_VIEW_HPP + +#include +#include +#include +#include + +namespace migraphx { + +template +struct tensor_view_iterator_read +{ + T* view; + constexpr auto& operator()(MIGRAPHX_CAPTURE_SOURCE_LOCATION(index_int) n) const + { + MIGRAPHX_ASSERT(view != nullptr); + return (*view)[n]; + } +}; + +template +struct tensor_view +{ + using type = T; + using shape_type = Shape; + using index_array = typename Shape::index_array; + using iterator = basic_iota_iterator, index_int>; + + constexpr Shape get_shape() const { return Shape{}; } + constexpr auto size() const { return get_shape().elements(); } + + struct index_to_offset + { + index_int offset; + template + constexpr index_to_offset(U i) : offset(Shape{}.index(i)) + { + } + }; + + constexpr T& operator[](MIGRAPHX_CAPTURE_SOURCE_LOCATION(index_to_offset) i) const + { + index_to_offset ito = i; + MIGRAPHX_WARN(ito.offset < get_shape().element_space(), + i, + "Out of bounds access at offset: ", + ito.offset); + return x[ito.offset]; + } + + constexpr T* data() const { return x; } + + constexpr auto begin() const { return iterator{0, {this}}; } + constexpr auto end() const { return iterator{this->size(), {this}}; } + + constexpr auto begin_at(index_array i) const + { + MIGRAPHX_ASSERT(get_shape().single(i) < get_shape().elements()); + MIGRAPHX_ASSERT(get_shape().index(i) < get_shape().element_space()); + return iterator{get_shape().single(i), {this}}; + } + + template + constexpr tensor_view with(U* y) const + { + static_assert(sizeof(T) == sizeof(U), "Not the same size"); + return {y}; + } + + T* x; +}; + +template +using get_shape_c = typename T::shape_type; + +template +constexpr tensor_view make_tensor_view(T* x, Shape) +{ + return {x}; +} + +template +constexpr auto reorder_tensor_view(T x, Permutation perm) +{ + return make_tensor_view(x.data(), reorder_shape(x.get_shape(), perm)); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_TENSOR_VIEW_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/tile.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/tile.hpp new file mode 100644 index 000000000..1f11b214f --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/tile.hpp @@ -0,0 +1,168 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +#ifndef MIGRAPHX_GUARD_KERNELS_TILE_HPP +#define MIGRAPHX_GUARD_KERNELS_TILE_HPP + +#include +#include +#include +#include + +namespace migraphx { + +struct tile +{ + template + static constexpr auto pad_shape(Shape) + { + constexpr Shape s{}; + constexpr auto axis = s.strides.size() - _c<1>; + constexpr auto strides = transform_i(s.strides, [](auto stride, auto i) { + if constexpr(i == decltype(axis){}) + { + // Pad by 1 element extra to avoid memory bank conflicts + return stride + 1; + } + else + { + return stride; + } + }); + return make_shape(s.lens, strides); + } + struct load + { + template + static __device__ auto copy(index idx, T x) + { + return [=](auto f) { + using type = typename T::type; + constexpr auto s = pad_shape(make_packed_shape(get_shape_c{})); + constexpr auto size = s.element_space(); + __shared__ type buffer[size]; + auto b = make_tensor_view(buffer, s); + local_tensor_copy(idx, x, b); + f(b); + }; + } + }; + struct store + { + template + static __device__ auto copy(index idx, T x) + { + return [=](auto f) { + using type = typename T::type; + constexpr auto s = pad_shape(make_packed_shape(get_shape_c{})); + constexpr auto size = s.element_space(); + __shared__ type buffer[size]; + auto b = make_tensor_view(buffer, s); + f(b); + local_tensor_copy(idx, b, x); + }; + } + }; + struct none + { + template + static __device__ auto copy(index, T x) + { + return [=](auto f) { f(x); }; + } + }; + + template + static constexpr auto slice(T x, index_int group, InnerLens, OuterLens) + { + constexpr auto outer_strides = + transform(x.get_shape().strides, InnerLens{}, [](auto stride, auto inner_len) { + return stride * inner_len; + }); + constexpr auto is = make_shape(InnerLens{}, x.get_shape().strides); + constexpr auto os = make_shape(OuterLens{}, outer_strides); + auto offset = os.index(group); + MIGRAPHX_ASSERT((os.element_space() + is.element_space()) == + (x.get_shape().element_space() + _c<1>)); + MIGRAPHX_ASSERT((is.elements() + group) <= x.get_shape().elements()); + MIGRAPHX_ASSERT((is.element_space() + offset) <= x.get_shape().element_space()); + return make_tensor_view(x.data() + offset, is); + } + + template + static __device__ auto auto_slice(index idx) + { + return make_transform([=](auto f, auto... 
xs) { + idx.group_stride(OuterLens{}.product(), + [=](auto group) { f(slice(xs, group, InnerLens{}, OuterLens{})...); }); + }); + } + + template + static __device__ auto auto_copy(index idx) + { + return make_transform([=](auto f, auto... xs) { + static_assert(sizeof...(Modes) == sizeof...(xs)); + auto invoke = [=](auto... ys) { + if constexpr((is_same{} or ...)) + __syncthreads(); + f(ys...); + if constexpr((is_same{} or ...)) + __syncthreads(); + }; + join(invoke, Modes::copy(idx, xs)...); + }); + } +}; + +template +__device__ auto tile_stride(index idx) +{ + if constexpr(Tiled) + { + return [=](auto... xs) { return idx.local_stride(xs...); }; + } + else + { + return [=](auto... xs) { return idx.global_stride(xs...); }; + } +} + +template +__device__ auto auto_tile(InnerLens, OuterLens) +{ + if constexpr((is_same{} and ...)) + { + return transform_args(); + } + else + { + auto idx = make_index(); + return transform_args(tile::auto_slice(idx), + tile::auto_copy(idx)); + } +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_TILE_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/tuple.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/tuple.hpp new file mode 100644 index 000000000..c54a9f4d3 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/tuple.hpp @@ -0,0 +1,184 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +#ifndef MIGRAPHX_GUARD_KERNELS_TUPLE_HPP +#define MIGRAPHX_GUARD_KERNELS_TUPLE_HPP + +#include + +namespace migraphx { + +namespace tuple_detail { + +template +struct element_storage +{ + [[no_unique_address]] T element; +}; + +template +constexpr const auto& get_element(const element_storage& x) +{ + return x.element; +} + +template +constexpr auto& get_element(element_storage& x) +{ + return x.element; +} + +struct unpack_t +{ +}; + +template +struct tuple_storage; + +template +struct tuple_storage, Ts...> : element_storage... +{ + template + constexpr tuple_storage(Us... ys) : element_storage{static_cast(ys)}... + { + } + + template + constexpr tuple_storage(unpack_t, U y) : element_storage{static_cast(y[_c])}... 
+ { + } + + template + constexpr auto operator()(F f) const + { + return f(static_cast&>(*this).element...); + } + + template + constexpr auto operator()(F f) + { + return f(static_cast&>(*this).element...); + } + + template + constexpr auto& operator[](IntegralConstant i) + { + static_assert(i < sizeof...(Ts), "Out of bounds tuple access"); + return get_element(*this); + } + + template + constexpr auto& operator[](IntegralConstant i) const + { + static_assert(i < sizeof...(Ts), "Out of bounds tuple access"); + return get_element(*this); + } + + constexpr index_constant size() const { return {}; } + constexpr auto empty() const { return size() == _c<0>; } +}; + +template +using tuple_base = tuple_detail::tuple_storage::type, Ts...>; + +} // namespace tuple_detail + +// NOLINTNEXTLINE +#define MIGRAPHX_DEVICE_TUPLE_OP(op, binary_op) \ + template \ + constexpr tuple& operator op(const tuple& rhs) \ + { \ + (*this)( \ + [&](auto&... xs) { rhs([&](const auto&... ys) { swallow{((xs op ys), 0)...}; }); }); \ + return *this; \ + } \ + template \ + friend constexpr auto operator binary_op(const tuple& lhs, const tuple& rhs) \ + { \ + using result = tuple() binary_op declval())...>; \ + return lhs([&](auto&... xs) { \ + return rhs([&](const auto&... ys) { return result{xs binary_op ys...}; }); \ + }); \ + } + +template +struct tuple : tuple_detail::tuple_base +{ + using base = tuple_detail::tuple_base; + + constexpr tuple() : base(Ts{}...) {} + + template {} and ...))> + constexpr tuple(Us... ys) : base(ys...) + { + } + + template {} and ...))> + constexpr tuple(tuple y) : base(tuple_detail::unpack_t{}, y) + { + } + + MIGRAPHX_DEVICE_TUPLE_OP(+=, +) + MIGRAPHX_DEVICE_TUPLE_OP(-=, -) + MIGRAPHX_DEVICE_TUPLE_OP(*=, *) + MIGRAPHX_DEVICE_TUPLE_OP(/=, /) + MIGRAPHX_DEVICE_TUPLE_OP(%=, %) + MIGRAPHX_DEVICE_TUPLE_OP(&=, &) + MIGRAPHX_DEVICE_TUPLE_OP(|=, |) + MIGRAPHX_DEVICE_TUPLE_OP(^=, ^) + + friend constexpr bool operator==(const tuple& x, const tuple& y) + { + return x([&](const auto&... xs) { + return y([&](const auto&... ys) { return ((xs == ys) and ...); }); + }); + } + friend constexpr bool operator!=(const tuple& x, const tuple& y) { return not(x == y); } + friend constexpr bool operator<(const tuple& x, const tuple& y) + { + return x([&](const auto&... xs) { + return y([&](const auto&... ys) { + return fold([&](auto a, auto b) { return a == 0 ? b() : a; })(0, [&] { + return (xs < ys) ? -1 : (ys < xs) ? 1 : 0; + }...); + }); + }) < 0; + } + friend constexpr bool operator>(const tuple& x, const tuple& y) { return y < x; } + friend constexpr bool operator<=(const tuple& x, const tuple& y) { return not(x > y); } + friend constexpr bool operator>=(const tuple& x, const tuple& y) { return not(x < y); } +}; + +template +constexpr tuple make_tuple(Ts... xs) +{ + return {xs...}; +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_TUPLE_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp new file mode 100644 index 000000000..24b7d4a5b --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp @@ -0,0 +1,289 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. 
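// [Illustrative sketch, not part of this patch.] MIGRAPHX_DEVICE_TUPLE_OP above generates an
// elementwise compound assignment plus the matching binary operator, so the device tuple behaves
// like a small value type. A stripped-down host analogue of the same pattern for two elements:
struct pair2
{
    int a;
    float b;
    constexpr pair2& operator+=(const pair2& rhs)
    {
        a += rhs.a; // every element combined independently, as the unpacking lambdas do
        b += rhs.b;
        return *this;
    }
    friend constexpr pair2 operator+(pair2 lhs, const pair2& rhs)
    {
        lhs += rhs;
        return lhs;
    }
};
// pair2 z = pair2{1, 2.0f} + pair2{3, 4.0f}; // z holds {4, 6.0f}; the generated comparison
// operators additionally give the device tuple lexicographic ordering.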
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP + +#include +#include + +namespace migraphx { + +template +using void_t = void; + +template +U private_declval(int); + +template +T private_declval(long); + +template +auto declval() noexcept -> decltype(private_declval(0)); + +template +struct is_callable_impl : false_type +{ +}; + +template +struct is_callable_impl()(declval()...))>, F, Ts...> : true_type +{ +}; + +template +using is_callable = is_callable_impl; + +template +struct type_identity +{ + using type = T; +}; + +template +struct enable_if +{ +}; + +template +struct enable_if +{ + using type = T; +}; + +template +using enable_if_t = typename enable_if::type; + +template +struct conditional +{ + using type = T; +}; + +template +struct conditional +{ + using type = F; +}; + +template +using conditional_t = typename conditional::type; + +// NOLINTNEXTLINE +#define MIGRAPHX_BUILTIN_TYPE_TRAIT1(name) \ + template \ + struct name : bool_constant<__##name(T)> \ + { \ + } + +// NOLINTNEXTLINE +#define MIGRAPHX_BUILTIN_TYPE_TRAIT2(name) \ + template \ + struct name : bool_constant<__##name(T, U)> \ + { \ + } + +// NOLINTNEXTLINE +#define MIGRAPHX_BUILTIN_TYPE_TRAITN(name) \ + template \ + struct name : bool_constant<__##name(Ts...)> \ + { \ + } + +// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_arithmetic); +// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_destructible); +// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_nothrow_destructible); +// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_pointer); +// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_scalar); +// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_signed); +// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_void); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_abstract); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_aggregate); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_array); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_class); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_compound); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_const); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_empty); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_enum); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_final); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_floating_point); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_function); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_fundamental); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_integral); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_literal_type); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_lvalue_reference); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_member_function_pointer); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_member_object_pointer); 
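// [Illustrative note, not part of this patch.] Each MIGRAPHX_BUILTIN_TYPE_TRAIT1/2/N invocation in
// the list below defines a trait directly on top of the matching compiler builtin, so the kernels
// get <type_traits>-style queries without pulling in the standard header under hiprtc. For
// example, MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_same) expands to roughly:
//
//   template <class T, class U>
//   struct is_same : bool_constant<__is_same(T, U)> {};
//
// (template heads shown here for clarity since they were lost in this extraction; bool_constant is
// assumed to come from the kernels' own integral-constant utilities).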
+MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_member_pointer); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_object); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_pod); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_polymorphic); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_reference); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_rvalue_reference); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_standard_layout); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_trivial); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_trivially_copyable); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_trivially_destructible); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_union); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_unsigned); +MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_volatile); +MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_assignable); +MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_base_of); +MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_convertible); +MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_nothrow_assignable); +MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_same); +MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_trivially_assignable); +MIGRAPHX_BUILTIN_TYPE_TRAITN(is_constructible); +MIGRAPHX_BUILTIN_TYPE_TRAITN(is_nothrow_constructible); +MIGRAPHX_BUILTIN_TYPE_TRAITN(is_trivially_constructible); + +template +struct remove_cv +{ + using type = T; +}; + +template +struct remove_cv : remove_cv +{ +}; + +template +struct remove_cv : remove_cv +{ +}; + +template +using remove_cv_t = typename remove_cv::type; + +template +struct remove_reference +{ + using type = T; +}; +template +struct remove_reference +{ + using type = T; +}; +template +struct remove_reference +{ + using type = T; +}; + +template +using remove_reference_t = typename remove_reference::type; + +template +struct add_pointer : type_identity::type*> +{ +}; + +template +using add_pointer_t = typename add_pointer::type; + +template +struct is_void : is_same> +{ +}; + +template +struct common_type; + +template +struct common_type +{ + using type = T; +}; + +template +struct common_type +{ + using type = decltype(true ? declval() : declval()); +}; + +template +struct common_type +{ + using type = typename common_type::type, Us...>::type; +}; + +template +using common_type_t = typename common_type::type; + +#define MIGRAPHX_REQUIRES(...) 
enable_if_t<__VA_ARGS__, int> = 0 + +constexpr unsigned long long int_max(unsigned long n) +{ + // Note, left shift cannot be used to get the maximum value of int64_type or + // uint64_type because it is undefined behavior to left shift 64 bits for + // these types + if(n == sizeof(int64_t)) + return -1; + return (1ull << (n * 8)) - 1; +} + +template {} or is_floating_point{} or + is_same{})> +constexpr T numeric_max() +{ + if constexpr(is_integral{}) + { + if constexpr(is_unsigned{}) + return int_max(sizeof(T)); + else + return int_max(sizeof(T)) / 2; + } + else if constexpr(is_same{}) + return __DBL_MAX__; + else if constexpr(is_same{}) + return __FLT_MAX__; + else if constexpr(is_same{}) + return __FLT16_MAX__; + else if constexpr(is_same{}) + return 338953138925153547590470800371487866880.000000; + else + return 0; +} + +template +constexpr auto numeric_lowest() -> decltype(numeric_max()) +{ + if constexpr(is_integral{}) + { + if constexpr(is_unsigned{}) + return 0; + else + return -numeric_max() - 1; + } + else + { + return -numeric_max(); + } +} + +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/types.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/types.hpp new file mode 100644 index 000000000..c88343ce1 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/types.hpp @@ -0,0 +1,83 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
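// [Illustrative sketch, not part of this patch.] int_max(n) above computes the largest unsigned
// value representable in n bytes, special-casing the 8-byte width because shifting 1 left by 64
// bits would be undefined behaviour; numeric_max/numeric_lowest then derive signed limits from it.
// Host-side spot checks of the same arithmetic (names local to this sketch):
#include <cstdint>

constexpr unsigned long long host_int_max(unsigned long n)
{
    if(n == sizeof(std::int64_t))
        return ~0ull;             // all 64 bits set, same as returning -1 above
    return (1ull << (n * 8)) - 1; // n == 1 -> 255, n == 2 -> 65535, n == 4 -> 4294967295
}

static_assert(host_int_max(1) == 255);                                   // uint8_t max
static_assert(static_cast<long long>(host_int_max(1) / 2) == 127);       // int8_t max
static_assert(-static_cast<long long>(host_int_max(1) / 2) - 1 == -128); // int8_t lowest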
+ */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPES_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPES_HPP + +#include + +namespace migraphx { + +#if defined(MIGRAPHX_USE_HIPRTC) +using int8_t = signed char; +using uint8_t = unsigned char; +using int16_t = signed short; +using uint16_t = unsigned short; +using int32_t = signed int; +using uint32_t = unsigned int; +using int64_t = signed long long; +using uint64_t = unsigned long long; +#elif defined(MIGRAPHX_USE_HIPRTC) +using int8_t = __hip_int8_t; +using uint8_t = __hip_uint8_t; +using int16_t = __hip_int16_t; +using uint16_t = __hip_uint16_t; +using int32_t = __hip_int32_t; +using uint32_t = __hip_uint32_t; +using int64_t = __hip_int64_t; +using uint64_t = __hip_uint64_t; +#else +using int8_t = std::int8_t; +using uint8_t = std::uint8_t; +using int16_t = std::int16_t; +using uint16_t = std::uint16_t; +using int32_t = std::int32_t; +using uint32_t = std::uint32_t; +using int64_t = std::int64_t; +using uint64_t = std::uint64_t; +#endif // MIGRAPHX_USE_HIPRTC +using index_int = uint32_t; +using diff_int = int32_t; +using uintptr_t = uint64_t; + +static_assert(sizeof(int8_t) == 1, "int8_t must be 1 bytes"); +static_assert(sizeof(uint8_t) == 1, "uint8_t must be 1 bytes"); +static_assert(sizeof(int16_t) == 2, "int16_t must be 2 bytes"); +static_assert(sizeof(uint16_t) == 2, "uint16_t must be 2 bytes"); +static_assert(sizeof(int32_t) == 4, "int32_t must be 4 bytes"); +static_assert(sizeof(uint32_t) == 4, "uint32_t must be 4 bytes"); +static_assert(sizeof(int64_t) == 8, "int64_t must be 8 bytes"); +static_assert(sizeof(uint64_t) == 8, "uint64_t must be 8 bytes"); + +#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT + +template +using vec = T __attribute__((ext_vector_type(N))); + +using half = _Float16; +using half2 = migraphx::vec; +using bf16 = __bf16; + +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/unpack_int4.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/unpack_int4.hpp new file mode 100644 index 000000000..35ffcff7a --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/unpack_int4.hpp @@ -0,0 +1,57 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
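// [Illustrative sketch, not part of this patch.] vec<T, N> above is a Clang ext_vector_type, so a
// vector value supports elementwise arithmetic and [] indexing directly, and scalars are splatted
// across lanes in mixed expressions. A host-compilable sketch using the same extension (clang
// only; names local to this sketch):
using float4_t = float __attribute__((ext_vector_type(4)));

inline float4_t axpy4(float a, float4_t x, float4_t y)
{
    return a * x + y; // one expression updates all four lanes
}
// float4_t v = axpy4(2.0f, float4_t{1, 2, 3, 4}, float4_t{}); // v lanes: {2, 4, 6, 8}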
+ */ +#ifndef MIGRAPHX_GUARD_KERNELS_UNPACK_INT4_HPP +#define MIGRAPHX_GUARD_KERNELS_UNPACK_INT4_HPP + +#include "migraphx/kernels/types.hpp" +#include +#include + +namespace migraphx { + +template +__device__ void unpack_int4(Output output, Input input) +{ + const auto input_shape = input.get_shape(); + + make_index().global_stride(input_shape.elements(), [&](auto i) { + auto idx = input_shape.multi(i); + idx[Axis] *= 2; + const auto input_val = input[i]; + + // unpack_int4 op's normalize_compute_shape will ensure that Input::type is either uint8_t + // or int8_t + if constexpr(is_unsigned{}) + output[idx] = input_val & 0xfu; + else + // NOLINTNEXTLINE (hicpp-signed-bitwise) + output[idx] = static_cast(static_cast(input_val) << 4) >> 4; + + idx[Axis] += 1; + output[idx] = input_val >> 4; + }); +} + +} // namespace migraphx +#endif diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/vec.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/vec.hpp new file mode 100644 index 000000000..ae453ff40 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/vec.hpp @@ -0,0 +1,228 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_VEC_HPP +#define MIGRAPHX_GUARD_KERNELS_VEC_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { + +template +constexpr auto vec_size(vec) +{ + return index_constant{}; +} + +template +constexpr auto vec_size(T, ...) // NOLINT +{ + return index_constant<0>{}; +} + +template +constexpr auto vec_size() +{ + return decltype(vec_size(T{})){}; +} + +template +constexpr auto is_any_vec() +{ + if constexpr(sizeof...(Ts) == 0) + return false_type{}; + else + return bool_constant<((vec_size() + ...) 
> 0)>{}; +} + +template +constexpr auto vec_at(T x, I i) +{ + if constexpr(vec_size() == 0) + return x; + else + { + MIGRAPHX_ASSERT(i < vec_size()); + return x[i]; + } +} + +template +using vec_type = decltype(vec_at(T{}, 0)); + +template +constexpr auto common_vec_size() +{ + return fold([](auto x, auto y) { + if constexpr(x > y) + return x; + else + return y; + })(vec_size()...); +} + +// Bools can not be used as a vector type so convert it to uint8 +template +__device__ __host__ T* remove_bool(T* x) +{ + return x; +} + +inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast(x); } + +template +__device__ __host__ auto as_vec(T* x) +{ + if constexpr(N < 2) + return x; + else + return reinterpret_cast*>(x); +} + +template +using safe_vec = vec{}, uint8_t, T>, N>; + +template +constexpr auto vec_transform(Ts... xs) +{ + return [=](auto f) { + if constexpr(is_any_vec()) + { + using type = decltype(f(vec_at(xs, 0)...)); + constexpr auto size = common_vec_size(); + safe_vec result = {0}; + for(int i = 0; i < size; i++) + result[i] = f(vec_at(xs, i)...); + return result; + } + else + { + return f(xs...); + } + }; +} + +// Return a vector type of N from index i in another larger vector +// N will be 2 for half2 packing +template +constexpr vec, N> vec_packed_at(T x, I i) +{ + if constexpr(vec_size() == 0) + return vec{x}; + else + { + MIGRAPHX_ASSERT((i + N) <= vec_size()); + vec, N> result = {0}; + for(int j = 0; j < N; j++) + { + result[j] = x[i + j]; + } + return result; + } +} + +template +constexpr auto vec_packed_transform(Ts... xs) +{ + return [=](auto f) { + if constexpr(is_any_vec()) + { + using type = vec_type(xs, 0)...))>; + constexpr auto size = common_vec_size(); + safe_vec result = {0}; + for(int i = 0; i < size / N; i++) + { + // Call the function with packed vectors + safe_vec r = f(vec_packed_at(xs, i * N)...); + // Copy the packed vectors to the result + for(int j = 0; j < N; j++) + result[i * N + j] = r[j]; + } + return result; + } + else + { + return f(xs...); + } + }; +} + +template +constexpr auto vec_reduce(T x, Op op) +{ + if constexpr(vec_size() < 2) + return vec_type{x}; + else + { + vec_type result = x[0]; + for(int i = 1; i < vec_size(); i++) + result = op(result, x[i]); + return result; + } +} + +template +constexpr auto vec_generate(F f) +{ + using type = decltype(f(_c<0>)); + return sequence_c([&](auto... is) { return safe_vec{f(is)...}; }); +} + +template +struct implicit_conversion_op +{ + T x; + + template + constexpr operator vec() const + { + if constexpr(vec_size() == 0) + { + return x; + } + else + { + static_assert(vec_size() == N, "Vector mismatch size"); + return __builtin_convertvector(x, vec); + } + } + + template + constexpr operator U() const + { + return static_cast(x); + } +}; + +template +constexpr implicit_conversion_op implicit_conversion(T x) +{ + return {x}; +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP diff --git a/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp new file mode 100644 index 000000000..b456b5c6e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp @@ -0,0 +1,263 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP +#define MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP + +#include +#include + +namespace migraphx { + +template +constexpr auto tensor_vec_size() +{ + return vec_size(); +} + +template +constexpr auto tensor_vec_size(T) +{ + return tensor_vec_size(); +} + +template +constexpr auto shape_step(Shape s, Axis) +{ + static_assert(N > 0, "Vector size must be non-zero"); + return sequence(s.lens.size(), [&](auto... is) { + auto lens = transform(s.lens, index_ints{}, [&](auto i, auto j) { + constexpr auto axis = Axis::to(); + MIGRAPHX_ASSERT(i != 0); + MIGRAPHX_ASSERT(j != axis or i % N == 0); + if(j == axis) + return i / N; + else + return i; + }); + auto strides = transform(s.strides, index_ints{}, [&](auto i, auto j) { + constexpr auto axis = Axis::to(); + // If stride of the axis is zero then we dont need to adjust the other strides + if(Shape{}.strides[axis] == 0) + return i; + MIGRAPHX_ASSERT(j == axis or i % N == 0); + if(j == axis) + return i; + else + return i / N; + }); + MIGRAPHX_ASSERT(make_shape(lens, strides).elements() * N == s.elements()); + MIGRAPHX_ASSERT(strides[Axis{}] == 0 or + make_shape(lens, strides).element_space() * N == s.element_space()); + return make_shape(lens, strides); + }); +} + +template +__device__ __host__ auto as_vec(T x, Axis axis) +{ + if constexpr(N < 2) + return x; + else + return make_tensor_view(as_vec(remove_bool(x.data())), + shape_step(x.get_shape(), axis)); +} + +template +constexpr auto tensor_step(T x, Axis axis) +{ + if constexpr(N < 2) + { + return x; + } + else + { + constexpr auto s = decltype(x.get_shape()){}; + MIGRAPHX_ASSERT(s.strides[axis] == 0); + return make_tensor_view(x.data(), shape_step(s, axis)); + } +} + +template +__device__ __host__ auto as_vec(IntegralConstant ic, T&& x) +{ + return as_vec(x); +} + +template +constexpr index_int find_vector_axis_c(Shape s) +{ + // Find the fastest axis that is not broadcasted + index_int axis = 0; + for(index_int i = 1; i < s.lens.size(); i++) + { + if(s.strides[i] == 0) + continue; + if(s.strides[axis] == 0 or + pack_compare(less{}, pack(s.strides[i], s.lens[i]), pack(s.strides[axis], s.lens[axis]))) + axis = i; + } + return axis; +} + +template +constexpr index_int find_vector_axis_c(Shapes... 
ss) +{ + const bool all_broadcasted = (ss.broadcasted() and ...); + index_int axis = 0; + bool b = false; + by([&](auto s) { + if(b) + return; + // Skip broadcasted shapes if there are shapes not broadcasted + if(not all_broadcasted and s.broadcasted()) + return; + axis = find_vector_axis_c(s); + if(s.strides[axis] == 1) + b = true; + })(ss...); + if(not b) + return -1; + return axis; +} + +template +constexpr auto find_vector_axis(Shapes...) +{ + return _c; +} + +template +constexpr auto is_vectorizable_c(Axis axis, Shapes... ss) +{ + return ((axis < ss.lens.size() and ss.lens[axis] % N == 0 and + // Only vectorize broadcasted types with stride 0, since this causes issues in the + // preloader + ((not ss.broadcasted() and ss.strides[axis] == 1) or ss.strides[axis] == 0)) and + ...); +} + +template +constexpr auto is_vectorizable(Axis, Shapes...) +{ + return _c(Axis::to(), Shapes{}...)>; +} + +template +constexpr auto find_vectorize_size(P pred) +{ + if constexpr(decltype(pred(_c<4>)){}) + return _c<4>; + else if constexpr(decltype(pred(_c<2>)){}) + return _c<2>; + else + return _c<1>; +} + +template +__host__ __device__ auto auto_vectorize(T x) +{ + if constexpr(tensor_vec_size() == 0) + { + constexpr auto axis = find_vector_axis(x.get_shape()); + constexpr auto n = + find_vectorize_size([&](auto i) { return is_vectorizable(axis, x.get_shape()); }); + return as_vec(x, axis); + } + else + { + return x; + } +} + +template +inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs) +{ + // TODO: Just check there a single axis of 1 + constexpr bool packed_or_broadcasted = + ((xs.get_shape().packed() or xs.get_shape().broadcasted()) and ...); + if constexpr(packed_or_broadcasted) + { + constexpr auto axis = decltype(find_vector_axis(xs.get_shape()...)){}; + constexpr auto n = find_vectorize_size( + [&](auto i) { return is_vectorizable(axis, xs.get_shape()...); }); + by( + [&](auto x) { + constexpr auto s = decltype(x.get_shape()){}; + if constexpr(axis < s.strides.size()) + { + MIGRAPHX_ASSERT(s.strides[axis] == 0 or s.strides[axis] == 1); + MIGRAPHX_ASSERT(s.lens[axis] > 0); + MIGRAPHX_ASSERT(n == 1 or s.lens[axis] % n == 0); + if constexpr(s.strides[axis] == 0) + return tensor_step(x, axis); + else + return as_vec(x, axis); + } + else + { + return x; + } + }, + f)(xs...); + } + else + { + f(xs...); + } +} + +inline __device__ __host__ auto auto_vectorize() +{ + return make_transform([](auto f, auto... xs) { auto_vectorize_impl(f, xs...); }); +} + +template +__device__ __host__ auto vectorize_tensor(T x) +{ + constexpr auto shape = get_shape_c{}; + if constexpr(shape.lens[Axis] == 1) + return x; + else if constexpr(shape.strides[Axis] == 0) + return tensor_step(x, _c); + else + return as_vec(x, _c); +} + +template +__device__ __host__ auto vectorize() +{ + return make_transform([](auto f, auto... xs) { + if constexpr(N < 2) + { + f(xs...); + } + else + { + f(vectorize_tensor(xs)...); + } + }); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP diff --git a/docker/rocm/migraphx/targets/gpu/logsoftmax.cpp b/docker/rocm/migraphx/targets/gpu/logsoftmax.cpp new file mode 100644 index 000000000..63fc5eb5e --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/logsoftmax.cpp @@ -0,0 +1,53 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +shape hip_logsoftmax::compute_shape(const std::vector& inputs) const +{ + check_shapes{inputs, *this}.has(2).standard(); + return op.normalize_compute_shape({inputs.at(0)}); +} + +argument +hip_logsoftmax::compute(context& ctx, const shape&, const std::vector& args) const +{ + auto n_dim = args.front().get_shape().lens().size(); + auto tuned_axis = tune_axis(n_dim, op.axis, op.name()); + device::logsoftmax(ctx.get_stream().get(), args.back(), args.front(), tuned_axis); + return args.back(); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/loop.cpp b/docker/rocm/migraphx/targets/gpu/loop.cpp new file mode 100644 index 000000000..ad5fc210c --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/loop.cpp @@ -0,0 +1,126 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +shape hip_loop::compute_shape(std::vector inputs, std::vector mods) const +{ + auto input_num = (inputs.size() - 2) / 2; + inputs.erase(inputs.begin() + input_num, inputs.end()); + return op.compute_shape(inputs, std::move(mods)); +} + +struct gpu_loop +{ + int64_t max_iterations = 0; + + template + void copy(context& ctx, const argument& src, T& dst) const + { + argument arg_dst{src.get_shape(), &dst}; + copy_from_gpu(ctx, src, arg_dst); + } + + template + void copy(context& ctx, T src, const argument& dst) const + { + argument arg_src{dst.get_shape(), &src}; + copy_to_gpu(ctx, arg_src, dst); + } + + void append(const std::vector&, + const std::vector&, + const std::vector&, + int64_t, + int64_t) const + { + } + + void set_zero(context& ctx, const std::vector& concatenated_outputs, int iter) const + { + if(iter >= max_iterations) + return; + + auto elem_num = max_iterations - iter; + for(const auto& out : concatenated_outputs) + { + auto s = out.get_shape(); + auto size = s.bytes() / max_iterations; + auto lens = s.lens(); + lens[0] = elem_num; + shape ss{s.type(), lens}; + assert(ss.bytes() + iter * size <= out.get_shape().bytes()); + device::fill(ctx.get_stream().get(), argument(ss, out.data() + iter * size), 0); + } + } + + std::unordered_map get_output_params(const module& m) const + { + auto get_output_index = [](const std::string& name) { + std::string out_prefix = "#output_"; + auto loc = name.find(out_prefix); + if(loc != std::string::npos) + { + return std::stoi(name.substr(loc + out_prefix.size())); + } + + return -1; + }; + + const auto& param_names = m.get_parameter_names(); + std::unordered_map result; + for(const auto& name : param_names) + { + auto index = get_output_index(name); + if(index == -1) + continue; + result[name] = index; + } + + return result; + } +}; + +argument +hip_loop::compute(context& ctx, + const shape&, + const std::vector& args, + const std::vector& mods, + const std::function( + module_ref&, const std::unordered_map&)>& run) const +{ + return run_loop(gpu_loop{op.max_iterations}, op.scan_output_directions, ctx, args, mods, run); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/lowering.cpp b/docker/rocm/migraphx/targets/gpu/lowering.cpp new file mode 100644 index 000000000..adba54661 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/lowering.cpp @@ -0,0 +1,599 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_HIPBLASLT_GEMM); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_MIOPEN_POOLING) + +struct miopen_apply +{ + module* mod = nullptr; + module_pass_manager* mpm = nullptr; + const lowering* pass = nullptr; + std::unordered_map> apply_map{}; + instruction_ref last{}; + bool offload_copy = false; + bool compute_fp32 = false; + + context& get_context() const + { + assert(pass != nullptr); + assert(pass->ctx != nullptr); + return *pass->ctx; + } + + void check_shape(shape x, instruction_ref i) + { + assert(x == i->get_shape()); + (void)x; + (void)i; + } + + void init() + { + assert(mod != nullptr); + assert(pass != nullptr); +#if MIGRAPHX_USE_ROCBLAS + compute_fp32 = get_compute_fp32_flag(); +#endif + offload_copy = (mod == mpm->get_root_module()) ? pass->offload_copy : false; + + add_extend_op("argmax"); + add_extend_op("argmin"); + add_extend_op("logsoftmax"); + add_extend_op("multinomial"); + add_extend_op("nonzero"); + add_extend_op("prefix_scan_sum"); + add_extend_op("reverse"); + add_extend_op("rnn_var_sl_last_output"); + add_extend_op("rnn_var_sl_shift_output"); + add_extend_op("rnn_var_sl_shift_sequence"); + add_extend_op("topk"); + add_generic_op("contiguous"); + add_pooling_op(); +#if MIGRAPHX_USE_MIOPEN + add_convolution_op("convolution"); + add_convolution_op("convolution_backwards"); + add_convolution_op("quant_convolution"); + add_extend_op("lrn"); +#endif +#if MIGRAPHX_USE_ROCBLAS or MIGRAPHX_USE_HIPBLASLT + add_gemm_op("dot"); + add_gemm_op("quant_dot"); +#endif + add_if_op(); + add_loop_op(); + add_neg_op(); + add_nms_op(); + add_lrn_op(); + add_convolution_backwards_op(); + add_select_module_op(); + add_reshape_lazy_op(); + add_group_query_attention_op(); + add_scan_slice_op(); + } + + void copy_params() const + { + if(not offload_copy) + return; + + for(auto ins : iterator_for(*mod)) + { + if(ins->name() != "@param") + continue; + + // parameter no outputs, no need to insert copy to gpu + if(ins->outputs().empty()) + continue; + + auto pos = std::next(ins); + auto a = insert_allocation(pos, ins->get_shape()); + auto c = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a); + mod->replace_instruction(ins, c); + } + + // return instruction + auto ret = std::prev(mod->end()); + if(ret->name() == "@return") + { + const auto& inputs = ret->inputs(); + + // each input of ret need to be copied from gpu to host, and replace + // output with copy output + for(const auto& in : inputs) + { + auto p_output = mod->insert_instruction(ret, make_op("hip::copy_from_gpu"), in); + instruction::replace_argument(ret, in, p_output); + } + } + // else branch to handle legacy program without the return instruction + else + { + mod->add_instruction(make_op("hip::copy_from_gpu"), ret); + } + } + + void apply() + { + init(); + for(auto it = mod->begin(); it != mod->end(); it++) + { + auto s = it->get_shape(); + auto attrs = 
it->get_operator().attributes(); + if(apply_map.count(it->name()) > 0) + { + check_shape(s, apply_map.at(it->name())(it)); + } + else if(has_compiler_for(it->name())) + { + check_shape(s, insert_precompile_op(it)); + } + else if(attrs.contains("target")) + { + check_shape(s, insert_custom_op(it, attrs)); + } + if(attrs.contains("prefill")) + { + insert_fill(it, attrs.at("prefill")); + } + } + copy_params(); + } + + void insert_fill(instruction_ref ins, value v) const + { + instruction_ref alloc = instruction::get_output_alias(ins, true); + if(alloc == ins) + return; + auto fill = mod->insert_instruction(ins, make_op("hip::fill", {{"value", v}}), alloc); + instruction::replace_argument(ins, alloc, fill); + } + + instruction_ref insert_custom_op(instruction_ref ins, const value& attrs) const + { + const auto& custom_op = ins->get_operator(); + if(attrs.at("target") == "cpu") + { + auto s = ins->get_shape(); + std::vector cpu_inputs; + auto inputs = ins->inputs(); + auto output = inputs.back(); + std::transform( + inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) { + return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in); + }); + cpu_inputs.front() = + mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs); + auto cpu_out = mod->insert_instruction(ins, custom_op, cpu_inputs); + auto gpu_out = + mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output); + return mod->replace_instruction(ins, gpu_out); + } + return ins; + } + + instruction_ref insert_precompile_op(instruction_ref ins) const + { + auto output = insert_allocation(ins, ins->get_shape()); + std::vector refs = ins->inputs(); + refs.push_back(output); + + return mod->replace_instruction( + ins, + make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}), + refs, + ins->module_inputs()); + } + + instruction_ref insert_allocation(instruction_ref ins, const shape& s) const + { + return mod->insert_instruction(ins, make_op("allocate", {{"shape", to_value(s)}})); + } + +#if MIGRAPHX_USE_ROCBLAS or MIGRAPHX_USE_HIPBLASLT + template + void add_gemm_op(const std::string& name) + { + apply_map.emplace(name, [=](instruction_ref ins) { + std::vector refs = ins->inputs(); + assert(refs.size() == 2); + auto output = insert_allocation(ins, ins->get_shape()); + refs.push_back(output); +#if MIGRAPHX_USE_HIPBLASLT + if(not enabled(MIGRAPHX_ENABLE_HIPBLASLT_GEMM{}) or not hipblaslt_supported()) + { +#endif + return mod->replace_instruction( + ins, rocblas_gemm{Op{}, 1, 0, compute_fp32}, refs); +#if MIGRAPHX_USE_HIPBLASLT + } + std::string op_name = "gpu::hip_gemm"; + if(contains(name, "quant_")) + { + op_name = "gpu::hip_quant_gemm"; + } + operation gemm_op = make_op(op_name); + return mod->replace_instruction( + ins, + make_op("gpu::hipblaslt_op", {{"op", to_value(gemm_op)}}), + ins->inputs().at(0), + ins->inputs().at(1), + output); +#endif + }); + } +#endif + +#if MIGRAPHX_USE_MIOPEN + void add_convolution_op(const std::string& name) + { + apply_map.emplace(name, [=](instruction_ref ins) { + operation conv = make_op("gpu::" + name, {{"op", ins->get_operator().to_value()}}); + auto output = insert_allocation(ins, ins->get_shape()); + + return mod->replace_instruction(ins, + make_op("gpu::miopen_op", {{"op", to_value(conv)}}), + ins->inputs().at(0), + ins->inputs().at(1), + output); + }); + } +#endif + // add_generic_op just constructs the operator with no fields whereas add_extend_op copies over + // the fields Since it doesn't have fields its default constructed + 
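+    // For illustration (an editor's gloss on the comment above, using two ops that init()
+    // actually registers): add_generic_op("contiguous") lowers an instruction to
+    // make_op("gpu::contiguous") with no fields copied, while add_extend_op("logsoftmax")
+    // lowers to make_op("gpu::logsoftmax", op.to_value()) so that fields such as the axis
+    // carry over. In both cases the buffer created by insert_allocation is appended as the
+    // last argument of the replaced instruction.
+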
+ void add_generic_op(const std::string& name) { add_generic_op(name, "gpu::" + name); } + + void add_generic_op(const std::string& op_name, const std::string& gpu_name) + { + apply_map.emplace(op_name, [=](instruction_ref ins) { + auto output = insert_allocation(ins, ins->get_shape()); + std::vector refs = ins->inputs(); + refs.push_back(output); + + return mod->replace_instruction(ins, make_op(gpu_name), refs); + }); + } + + void add_extend_op(const std::string& name) { add_extend_op(name, "gpu::" + name); } + + void add_extend_op(const std::string& op_name, const std::string& gpu_name) + { + apply_map.emplace(op_name, [=](instruction_ref ins) { + auto&& op = ins->get_operator(); + auto output = insert_allocation(ins, ins->get_shape()); + std::vector refs = ins->inputs(); + refs.push_back(output); + + return mod->replace_instruction(ins, make_op(gpu_name, op.to_value()), refs); + }); + } + + static bool use_miopen_pooling(instruction_ref ins) + { + if(enabled(MIGRAPHX_DISABLE_MIOPEN_POOLING{}) or + not contains({shape::float_type, shape::half_type}, ins->get_shape().type())) + return false; + auto&& op = ins->get_operator(); + auto op_val = op.to_value(); + auto mode = op_val.at("mode").to(); + if(op_val.at("count_include_pad").to() and mode == op::pooling_mode::average) + return false; + if(mode == op::pooling_mode::lpnorm) + return false; + auto op_padding = op_val.at("padding").to_vector(); + auto kdims = ins->get_shape().lens().size() - 2; + return std::equal(op_padding.begin(), + op_padding.begin() + kdims, + op_padding.begin() + kdims, + op_padding.end()); + } + + void add_pooling_op() + { + apply_map.emplace("pooling", [=](instruction_ref ins) { + if(not use_miopen_pooling(ins)) + return insert_precompile_op(ins); +#if MIGRAPHX_USE_MIOPEN + auto output = insert_allocation(ins, ins->get_shape()); + std::vector refs = ins->inputs(); + auto&& op = ins->get_operator(); + refs.push_back(output); + return mod->replace_instruction(ins, make_op("gpu::pooling", op.to_value()), refs); +#else + return insert_precompile_op(ins); +#endif + }); + } + + // use 0 - input to represent neg + void add_neg_op() + { + apply_map.emplace("neg", [=](instruction_ref ins) { + auto s = ins->get_shape(); + std::vector zeros(s.elements(), 0.0f); + auto l0 = mod->add_literal(literal(s, zeros)); + auto output = insert_allocation(ins, s); + return mod->replace_instruction( + ins, make_op("gpu::sub"), l0, ins->inputs().front(), output); + }); + } + + // add input and output argument for the if operator + void add_if_op() + { + apply_map.emplace("if", [=](instruction_ref ins) { + std::vector inputs = ins->inputs(); + auto cpu_cond = + mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.front()); + auto sync_cond = mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_cond); + inputs.front() = sync_cond; + + return mod->replace_instruction(ins, ins->get_operator(), inputs, ins->module_inputs()); + }); + } + + // replace the loop operator with gpu_loop operator + void add_loop_op() + { + apply_map.emplace("loop", [=](instruction_ref ins) { + std::vector inputs = ins->inputs(); + // copy max_iter from gpu to cpu + auto cpu_max_iter = + mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(0)); + auto cpu_cond = + mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(1)); + auto synced_max_iter = + mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_max_iter, cpu_cond); + inputs.at(0) = synced_max_iter; + inputs.at(1) = cpu_cond; + auto copy_inputs = 
inputs; + std::transform(copy_inputs.begin(), + copy_inputs.end(), + std::back_inserter(inputs), + [&](auto in) { return insert_allocation(ins, in->get_shape()); }); + + auto mod_args = ins->module_inputs(); + auto output = insert_allocation(ins, ins->get_shape()); + + const auto* sub_mod = mod_args.front(); + auto cond_out = insert_allocation(ins, sub_mod->get_output_shapes().front()); + + // add cond and mod outputs to the argument list + inputs.push_back(cond_out); + inputs.push_back(output); + + return mod->replace_instruction( + ins, make_op("gpu::loop", ins->get_operator().to_value()), inputs, mod_args); + }); + } + + void add_nms_op() + { + apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) { + auto s = ins->get_shape(); + auto output = insert_allocation(ins, s); + std::vector cpu_inputs; + auto inputs = ins->inputs(); + std::transform( + inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) { + return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in); + }); + cpu_inputs.front() = + mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs); + auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs); + auto gpu_out = + mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output); + return mod->replace_instruction(ins, gpu_out); + }); + } + + void add_lrn_op() + { + apply_map.emplace("lrn", [=](instruction_ref ins) { + auto s = ins->get_shape(); + auto output = insert_allocation(ins, s); + std::vector cpu_inputs; + auto inputs = ins->inputs(); + std::transform( + inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) { + return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in); + }); + cpu_inputs.front() = + mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs); + auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs); + auto gpu_out = + mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output); + return mod->replace_instruction(ins, gpu_out); + }); + } + + void add_convolution_backwards_op() + { + apply_map.emplace("convolution_backwards", [=](instruction_ref ins) { + auto s = ins->get_shape(); + auto output = insert_allocation(ins, s); + std::vector cpu_inputs; + auto inputs = ins->inputs(); + std::transform( + inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) { + return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in); + }); + cpu_inputs.front() = + mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs); + auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs); + auto gpu_out = + mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output); + return mod->replace_instruction(ins, gpu_out); + }); + } + + /** + * Adds dynamic allocation for submodule output parameter. + */ + void add_select_module_op() + { + apply_map.emplace("select_module", [=](instruction_ref ins) { + auto s = ins->get_shape(); + auto output = insert_allocation(ins, s); + std::vector inputs = ins->inputs(); + inputs.push_back(output); + return mod->replace_instruction(ins, ins->get_operator(), inputs, ins->module_inputs()); + }); + } + + /** + * Adds reshape lazy to reshape ops that can be aliased instead of copied. + * `gpu::contiguous` are added before and after the reshape; these contiguous + * instructions can be removed by the eliminate_contiguous pass. 
+ */ + void add_reshape_lazy_op() + { + apply_map.emplace("reshape", [=](instruction_ref ins) { + std::vector before_contiguous_args = ins->inputs(); + auto before_alloc = insert_allocation(ins, std::prev(ins)->get_shape()); + before_contiguous_args.push_back(before_alloc); + auto before_contig = + mod->insert_instruction(ins, make_op("gpu::contiguous"), {before_contiguous_args}); + + auto new_lazy_reshape = mod->insert_instruction( + ins, + make_op("reshape_lazy", {{"dims", {ins->get_operator().to_value().at("dims")}}}), + before_contig); + + std::vector after_contiguous_args = {new_lazy_reshape}; + auto after_alloc = insert_allocation(new_lazy_reshape, new_lazy_reshape->get_shape()); + after_contiguous_args.push_back(after_alloc); + return mod->replace_instruction(ins, make_op("gpu::contiguous"), after_contiguous_args); + }); + } + + void add_group_query_attention_op() + { + apply_map.emplace("gpu::gqa_rotary_embedding", [=](instruction_ref ins) { + auto s = ins->get_shape(); + auto output = insert_allocation(ins, s); + auto new_inputs = ins->inputs(); + new_inputs.push_back(output); + return mod->replace_instruction( + ins, + make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}), + new_inputs); + }); + + apply_map.emplace("gpu::concat_past_present", [=](instruction_ref ins) { + return mod->replace_instruction( + ins, + make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}), + ins->inputs()); + }); + + apply_map.emplace("gpu::compute_attention_probabilities", [=](instruction_ref ins) { + auto s = ins->get_shape(); + auto output = insert_allocation(ins, s); + auto new_inputs = ins->inputs(); + new_inputs.push_back(output); + return mod->replace_instruction( + ins, + make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}), + new_inputs); + }); + + apply_map.emplace("gpu::gqa_softmax", [=](instruction_ref ins) { + auto s = ins->get_shape(); + auto inputs = ins->inputs(); + + auto new_inputs = ins->inputs(); + new_inputs.push_back(inputs.at(2)); + return mod->replace_instruction( + ins, + make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}), + new_inputs); + }); + + apply_map.emplace("gpu::compute_attention_scores", [=](instruction_ref ins) { + auto s = ins->get_shape(); + auto output = insert_allocation(ins, s); + auto new_inputs = ins->inputs(); + new_inputs.push_back(output); + return mod->replace_instruction( + ins, + make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}), + new_inputs); + }); + } + + void add_scan_slice_op() + { + apply_map.emplace("scan_slice", [=](instruction_ref ins) { + auto inputs = ins->inputs(); + auto cpu_idx = mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs[1]); + inputs[1] = mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_idx); + return mod->replace_instruction( + ins, mod->insert_instruction(ins, ins->get_operator(), inputs)); + }); + } +}; + +void lowering::apply(module_pass_manager& mpm) const +{ + miopen_apply{&mpm.get_module(), &mpm, this}.apply(); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/lrn.cpp b/docker/rocm/migraphx/targets/gpu/lrn.cpp new file mode 100644 index 000000000..2e99c208d --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/lrn.cpp @@ -0,0 +1,66 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +#if MIGRAPHX_USE_MIOPEN +shape miopen_lrn::compute_shape(const std::vector& inputs) const +{ + check_shapes{inputs, *this}.has(2).not_broadcasted(); + return inputs.at(1); +} + +argument miopen_lrn::compute(context& ctx, + const shape& output_shape, + const std::vector& args) const +{ + float alpha = 1; + float beta = 0; + auto x_desc = make_tensor(args[0].get_shape()); + auto y_desc = make_tensor(output_shape); + miopenLRNForward(ctx.get_stream().get_miopen(), + ldesc.get(), + &alpha, + x_desc.get(), + args[0].implicit(), + &beta, + y_desc.get(), + args[1].implicit(), + false, + nullptr); + + return args[1]; +} + +void miopen_lrn::finalize(context&, const shape&, const std::vector&) +{ + ldesc = make_lrn(op); +} +#endif +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/mlir.cpp b/docker/rocm/migraphx/targets/gpu/mlir.cpp new file mode 100644 index 000000000..61e0325ac --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/mlir.cpp @@ -0,0 +1,1300 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef MIGRAPHX_MLIR +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if !defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) || MLIR_MIGRAPHX_DIALECT_API_VERSION != 4 +#warning "Incompatible version of rocMLIR library used, disabling" +// Only undefine when not using cppcheck +#ifndef CPPCHECK +#undef MIGRAPHX_MLIR +#endif +#else +#include +#endif +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_MLIR); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNE_EXHAUSTIVE); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNE_LIMIT); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNING_DB); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNING_CFG); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_ENABLE_SPLITK); + +#ifdef MIGRAPHX_MLIR +template // NOLINT +struct mlir_handle +{ + struct ptr + { + ptr() = default; + ptr(std::nullptr_t) {} + ptr(T x) : obj(x) {} + + std::intptr_t get_value() const + { + static_assert(sizeof(T) == sizeof(std::intptr_t), "MLIR Handle different size"); + return reinterpret_cast(obj); + } + + T get() const { return obj; } + + friend bool operator==(ptr x, ptr y) { return x.get_value() == y.get_value(); } + + friend bool operator!=(ptr x, ptr y) { return not(x == y); } + + explicit operator bool() const noexcept { return obj != ptr(); } + T obj{}; + }; + + struct deleter + { + using pointer = ptr; + + void operator()(pointer x) const + { + if(x != nullptr) + { + (void)f(x.obj); + } + } + }; + + mlir_handle() : handle(nullptr) {} + + mlir_handle(T p) : handle(ptr{p}) {} + + T get() const + { + return handle.get().get(); // NOLINT(readability-redundant-smartptr-get) + } + + T release() { return handle.release().get(); } + + private: + std::unique_ptr handle; +}; + +#define MIGRAPHX_MANAGE_MLIR_HANDLE(T, F) migraphx::gpu::mlir_handle // NOLINT + +using mlir_context = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirContext, mlirContextDestroy); +using mlir_thread_pool = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirLlvmThreadPool, mlirLlvmThreadPoolDestroy); +using mlir_dialect_registry = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirDialectRegistry, + mlirDialectRegistryDestroy); +using mlir_module = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirModule, mlirModuleDestroy); +using mlir_operation = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOperation, mlirOperationDestroy); +using mlir_op_printing_flags = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOpPrintingFlags, + mlirOpPrintingFlagsDestroy); +using mlir_region = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRegion, mlirRegionDestroy); +using mlir_block = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirBlock, mlirBlockDestroy); +using mlir_pass_manager = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirPassManager, mlirPassManagerDestroy); +using mlir_tuning_table = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningTable, + mlirRockTuningTableDestroy); +using mlir_tuning_space = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningSpace, + mlirRockTuningSpaceDestroy); +using mlir_tuning_param = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningParam, + mlirRockTuningParamDestroy); + +std::string_view to_string_view(MlirStringRef s) { return {s.data, s.length}; } + +MlirStringRef make_mlir_string_ref(const std::string_view& s) +{ + return 
mlirStringRefCreate(s.data(), s.size()); +} + +template +void mlir_print(F f, T x, Printer printer) +{ + f( + x, + +[](MlirStringRef s, void* data) { + (*reinterpret_cast(data))(to_string_view(s)); + }, + &printer); +} + +template +void mlir_print(F f, T x, std::ostream& os) +{ + mlir_print(f, x, [&](auto s) { os << s; }); +} + +template +std::string mlir_print(F f, T x) +{ + std::stringstream ss; + mlir_print(f, x, [&](auto s) { ss << s; }); + return ss.str(); +} + +struct mlir_logger +{ + std::stringstream ss; + mlir_context* ctx; + std::optional id; + + mlir_logger() : ctx(nullptr), id(std::nullopt) {} + + mlir_logger(mlir_context* context) : ctx(context) + { + id = + mlirContextAttachDiagnosticHandler(ctx->get(), mlir_diagnostic_print_cb, this, nullptr); + } + + ~mlir_logger() + { + if(id.has_value()) + mlirContextDetachDiagnosticHandler(ctx->get(), *id); + } + + mlir_logger(const mlir_logger& other) = delete; + mlir_logger& operator=(const mlir_logger& other) = delete; + + mlir_logger(mlir_logger&& other) noexcept + : ss(std::move(other.ss)), ctx(other.ctx), id(other.id) + { + other.ctx = nullptr; + other.id = std::nullopt; + } + + mlir_logger& operator=(mlir_logger other) noexcept + { + std::swap(ss, other.ss); + std::swap(ctx, other.ctx); + std::swap(id, other.id); + return *this; + } + + std::string str() const { return ss.str(); } + + void clear() { ss = std::stringstream{}; } + + static MlirLogicalResult mlir_diagnostic_print_cb(MlirDiagnostic diag, void* logger); + + MlirLogicalResult handle(MlirDiagnostic diag); +}; + +MlirLogicalResult mlir_logger::mlir_diagnostic_print_cb(MlirDiagnostic diag, void* logger) +{ + return reinterpret_cast(logger)->handle(diag); +} + +MlirLogicalResult mlir_logger::handle(MlirDiagnostic diag) +{ + MlirDiagnosticSeverity sev = mlirDiagnosticGetSeverity(diag); + switch(sev) + { + case MlirDiagnosticSeverity::MlirDiagnosticError: ss << "Error: "; break; + case MlirDiagnosticSeverity::MlirDiagnosticWarning: ss << "Warning: "; break; + case MlirDiagnosticSeverity::MlirDiagnosticNote: ss << "Note: "; break; + case MlirDiagnosticSeverity::MlirDiagnosticRemark: ss << "Remark: "; break; + } + mlir_print(mlirDiagnosticPrint, diag, [&](auto s) { ss << s; }); + ss << std::endl; + for(intptr_t i = 0, e = mlirDiagnosticGetNumNotes(diag); i < e; ++i) + { + (void)handle(mlirDiagnosticGetNote(diag, i)); + } + return mlirLogicalResultSuccess(); +} + +struct mlir_program +{ + mlir_program() + : ctx(mlirContextCreateWithRegistry(get_dialect_registry().get(), + /*threadingEnable=*/false)), + location(mlirLocationUnknownGet(ctx.get())), + mmodule(mlirModuleCreateEmpty(location)), + logger(&ctx) + { + mlirContextSetThreadPool(ctx.get(), get_thread_pool().get()); + mlirContextLoadAllAvailableDialects(ctx.get()); + } + + static mlir_dialect_registry& get_dialect_registry() + { + static std::once_flag init_guard; + static mlir_dialect_registry the_registry; + // The MLIR registration functions (for dialects and passes) are not + // necessarily thread-safe and need to be executed exactly once + // (especially since they eventually call non-thread-safe LLVM + // initilizations). + std::call_once(init_guard, [&]() { + the_registry = mlirDialectRegistryCreate(); + mlirRegisterRocMLIRDialects(the_registry.get()); + mlirRegisterRocMLIRPasses(); + }); + return the_registry; + } + + static mlir_thread_pool& get_thread_pool() + { + // To save on overhead, we create one LLVM thread pool and reuse it + // across all MLIR contexts as recommended by MLIR upstream. 
+ // Note that this is thread-safe as of C++11. + static mlir_thread_pool the_pool = mlirLlvmThreadPoolCreate(); + return the_pool; + } + + MlirType make_type(shape::type_t t) const + { + MlirType result; + shape::visit(t, [&](auto as) { + if(as.type_enum() == shape::float_type) + result = mlirF32TypeGet(ctx.get()); + else if(as.type_enum() == shape::half_type) + result = mlirF16TypeGet(ctx.get()); + else if(as.type_enum() == shape::bf16_type) + result = mlirBF16TypeGet(ctx.get()); + else if(as.type_enum() == shape::fp8e4m3fnuz_type) + result = mlirFloat8E4M3FNUZTypeGet(ctx.get()); + else if(as.type_enum() == shape::fp8e5m2fnuz_type) + result = mlirFloat8E5M2FNUZTypeGet(ctx.get()); + else if(as.type_enum() == shape::fp8e4m3fn_type) + result = mlirFloat8E4M3FNTypeGet(ctx.get()); + else if(as.type_enum() == shape::fp8e5m2_type) + result = mlirFloat8E5M2TypeGet(ctx.get()); + else if(as.type_enum() == shape::double_type) + result = mlirF64TypeGet(ctx.get()); + else if(as.is_integral()) + { + if(as.is_unsigned()) + { + result = mlirIntegerTypeUnsignedGet(ctx.get(), as.size() * 8); + } + else + { + result = mlirIntegerTypeSignedGet(ctx.get(), as.size() * 8); + } + } + else + MIGRAPHX_THROW("Unsupported type: " + std::to_string(as.type_enum())); + }); + return result; + } + + MlirType make_mlir_shaped(const shape& s) const + { + if(s.dynamic()) + MIGRAPHX_THROW("MLIR does not support dynamic shapes"); + std::vector lens(s.lens().begin(), s.lens().end()); + std::vector strides(s.strides().begin(), s.strides().end()); + return rocmlirMIXRShapedTypeGet( + lens.size(), lens.data(), strides.data(), make_type(s.type())); + } + + template + std::vector make_mlir_shapeds(const Range& r) + { + std::vector result; + std::transform(r.begin(), r.end(), std::back_inserter(result), [&](const auto& s) { + return make_mlir_shaped(s); + }); + return result; + } + + MlirType make_function_type(const std::vector& inputs, const std::vector& outputs) + { + auto in = make_mlir_shapeds(inputs); + auto out = make_mlir_shapeds(outputs); + return mlirFunctionTypeGet(ctx.get(), in.size(), in.data(), out.size(), out.data()); + } + + MlirIdentifier id(const std::string_view& s) const + { + return mlirIdentifierGet(ctx.get(), make_mlir_string_ref(s)); + } + + MlirAttribute attribute(std::int64_t i) const + { + return mlirIntegerAttrGet(mlirIntegerTypeGet(ctx.get(), 64), i); + } + MlirAttribute attribute(std::uint64_t i) const + { + if(i > (std::numeric_limits::max() / 2)) + MIGRAPHX_THROW("MLIR cant handle large integer values since they are ambiguous"); + return mlirIntegerAttrGet(mlirIntegerTypeGet(ctx.get(), 64), i); + } + MlirAttribute attribute(unsigned char i) const { return attribute(std::uint64_t(i)); } + MlirAttribute attribute(bool b) const { return mlirBoolAttrGet(ctx.get(), b ? 
1 : 0); } + MlirAttribute attribute(double d) const + { + return mlirFloatAttrDoubleGet(ctx.get(), mlirF64TypeGet(ctx.get()), d); + } + MlirAttribute attribute(const std::string& s) const + { + return mlirStringAttrGet(ctx.get(), make_mlir_string_ref(s)); + } + MlirAttribute attribute(std::nullptr_t) const { return {}; } + template + MlirAttribute attribute(const std::vector& v) const + { + std::vector attributes; + attributes.reserve(v.size()); + std::transform(v.begin(), v.end(), std::back_inserter(attributes), [&](auto&& x) { + return attribute(x); + }); + return mlirArrayAttrGet(ctx.get(), attributes.size(), attributes.data()); + } + MlirAttribute attribute(const value& v) const + { + MlirAttribute attr; + v.visit_value([&](auto&& x) { attr = attribute(x); }); + return attr; + } + MlirAttribute attribute(const std::vector& v) const + { + if(v.empty()) + { + return mlirArrayAttrGet(ctx.get(), 0, nullptr); + } + if(not v.front().get_key().empty()) + { + std::vector attributes = name_attributes(v); + return mlirDictionaryAttrGet(ctx.get(), attributes.size(), attributes.data()); + } + else + { + std::vector attributes; + attributes.reserve(v.size()); + std::transform(v.begin(), v.end(), std::back_inserter(attributes), [&](auto&& x) { + return attribute(x); + }); + return mlirArrayAttrGet(ctx.get(), attributes.size(), attributes.data()); + } + } + + MlirAttribute attribute(MlirType t) const { return mlirTypeAttrGet(t); } + + MlirAttribute attribute(MlirAttribute a) const { return a; } + + template + MlirNamedAttribute name_attribute(const std::string_view& key, const T& x) const + { + MlirNamedAttribute attr; + attr.name = id(key); + attr.attribute = attribute(x); + return attr; + } + + using attribute_t = std::variant, + MlirType, + MlirAttribute>; + using named_attribute_t = std::pair; + + MlirNamedAttribute name_attribute(const named_attribute_t& na) const + { + return name_attribute(na.first, + std::visit([&](const auto& x) { return attribute(x); }, na.second)); + } + + std::vector + name_attributes(const std::vector& named_attrs) const + { + std::vector attributes; + attributes.reserve(named_attrs.size()); + std::transform(named_attrs.begin(), + named_attrs.end(), + std::back_inserter(attributes), + [&](const named_attribute_t& a) { return name_attribute(a); }); + return attributes; + } + + std::vector name_attributes(const value& v) const + { + std::vector attributes; + attributes.reserve(v.size()); + migraphx::transform_if( + v.begin(), + v.end(), + std::back_inserter(attributes), + [&](const value& x) { return not x.is_null(); }, + [&](const value& x) { return name_attribute(x.get_key(), x.without_key()); }); + return attributes; + } + + struct mlir_operation_state + { + mlir_operation_state(mlir_program& p, const std::string_view& name) + : prog(&p), op_state(mlirOperationStateGet(make_mlir_string_ref(name), p.location)) + { + } + + mlir_operation_state& add_attributes(const std::vector& named_attrs) + { + auto attributes = prog->name_attributes(named_attrs); + if(not attributes.empty()) + { + mlirOperationStateAddAttributes(&op_state, attributes.size(), attributes.data()); + } + return *this; + } + + mlir_operation_state& add_attribute_value(const value& v) + { + auto attributes = prog->name_attributes(v); + if(not attributes.empty()) + { + mlirOperationStateAddAttributes(&op_state, attributes.size(), attributes.data()); + } + return *this; + } + + mlir_operation_state& add_regions(std::vector rs) + { + regions = std::move(rs); + return *this; + } + + mlir_operation_state& 
add_region(mlir_region r) + { + regions.emplace_back(std::move(r)); + return *this; + } + + mlir_operation_state& add_results(const std::vector& outputs) + { + auto x = prog->make_mlir_shapeds(outputs); + if(not x.empty()) + { + mlirOperationStateAddResults(&op_state, x.size(), x.data()); + } + return *this; + } + + mlir_operation_state& add_operands(const std::vector& inputs) + { + if(not inputs.empty()) + { + mlirOperationStateAddOperands(&op_state, inputs.size(), inputs.data()); + } + return *this; + } + + mlir_operation create_operation() + { + std::vector mregions(regions.size()); + std::transform(regions.begin(), regions.end(), mregions.begin(), [](const auto& r) { + return r.get(); + }); + if(not mregions.empty()) + { + mlirOperationStateAddOwnedRegions(&op_state, mregions.size(), mregions.data()); + } + mlir_operation op(mlirOperationCreate(&op_state)); + // Release memory since mlir_operation owns it + for(auto& r : regions) + r.release(); + regions.clear(); + return op; + } + + mlir_program* prog; + MlirOperationState op_state; + std::vector regions = {}; + }; + + mlir_operation_state create_operation_state(const std::string_view& name) + { + return {*this, name}; + } + + std::vector insert(MlirBlock body, mlir_operation_state ops) + { + std::vector result; + mlir_operation op = ops.create_operation(); + auto weak_op = op.get(); + mlirBlockAppendOwnedOperation(body, op.release()); + + auto n = mlirOperationGetNumResults(weak_op); + result.reserve(n); + transform(range(n), std::back_inserter(result), [&](auto i) { + return mlirOperationGetResult(weak_op, i); + }); + return result; + } + + MlirBlock + insert(MlirBlock body, const module& m, std::unordered_map& ins_map) + { + auto names = m.get_parameter_names(); + std::sort(names.begin(), names.end()); + std::vector inputs; + std::transform(names.begin(), + names.end(), + std::back_inserter(inputs), + [&](const std::string& name) { return m.get_parameter_shape(name); }); + std::vector outputs = m.get_output_shapes(); + + std::vector arg_locs(inputs.size(), location); + auto body_inputs = make_mlir_shapeds(inputs); + mlir_region region = mlirRegionCreate(); + mlir_block fbody = mlirBlockCreate(body_inputs.size(), body_inputs.data(), arg_locs.data()); + MlirBlock result = fbody.get(); + mlirRegionAppendOwnedBlock(region.get(), fbody.release()); + + auto ops = create_operation_state("func.func"); + ops.add_attributes({{"function_type", make_function_type(inputs, outputs)}, + {"sym_name", sym_name}, + {"kernel", std::string("mixr")}, + {"arch", target_arch}, + {"num_cu", num_cu}}); + if(enabled(MIGRAPHX_MLIR_ENABLE_SPLITK{})) + { + ops.add_attributes({{"enable_splitk_for_tuning", mlirUnitAttrGet(ctx.get())}}); + } + ops.add_region(std::move(region)); + insert(body, std::move(ops)); + + for(auto i : range(names.size())) + ins_map[m.get_parameter(names[i])] = mlirBlockGetArgument(result, i); + return result; + } + + static bool is_reshape(const std::string& name) + { + return contains({"reshape", "lazy_reshape", "squeeze", "unsqueeze", "flatten"}, name); + } + + static std::string get_name(instruction_ref ins) + { + if(ins->name() == "@return") + return "func.return"; + if(ins->name() == "@literal") + return "migraphx.literal"; + if(ins->name() == "unpack_int4") + return "migraphx.unpack"; + if(is_reshape(ins->name())) + return "migraphx.reshape"; + return "migraphx." 
+ ins->name(); + } + + static value get_operator_value(instruction_ref ins) + { + const operation& op = ins->get_operator(); + auto v = op.to_value(); + + // Reshape operator can have dim 0 or -1. + // Avoid passing those on to MLIR: + if(is_reshape(op.name())) + v = {{"dims", ins->get_shape().lens()}}; + + if(op.name() == "convolution" or op.name() == "quant_convolution") + { + // Adjust symetrical padding + if(v.at("padding").size() == v.at("stride").size()) + { + auto padding = v.at("padding"); + std::copy(padding.begin(), padding.end(), std::back_inserter(v.at("padding"))); + } + } + + if(op.name() == "unpack_int4") + v["axis"] = ins->get_shape().ndim() - 1; + + return v; + } + + static shape get_shape(instruction_ref ins) + { + if(ins->name() == "@return") + { + assert(ins->inputs().size() == 1); + return ins->inputs().front()->get_shape(); + } + return ins->get_shape(); + } + + static std::string get_symbol_name(const module& m) + { + return "mlir_" + gen::generate_name_from_ops(m); + } + + static void validate(const module& m) + { + if(m.begin() == m.end()) + MIGRAPHX_THROW("Empty module"); + auto last = std::prev(m.end()); + if(last->name() != "@return") + MIGRAPHX_THROW("Missing @return as last instruction."); + } + + void parse(const module& m) + { + validate(m); + sym_name = get_symbol_name(m); + auto mbody = mlirModuleGetBody(mmodule.get()); + std::unordered_map ins_map; + auto fbody = insert(mbody, m, ins_map); + + for(auto ins : iterator_for(m)) + { + if(ins->name() == "@param") + continue; + if(ins->name() == "contiguous") + { + ins_map[ins] = ins_map[ins->inputs().at(0)]; + continue; + } + auto name = get_name(ins); + auto ops = create_operation_state(name); + ops.add_attribute_value(get_operator_value(ins)); + if(ins->name() != "@return") + ops.add_results({get_shape(ins)}); + + if(ins->name() == "@literal") + { + literal r = ins->get_literal(); + auto sh = ins->get_shape(); + + MlirType shaped_type = make_mlir_shaped(sh); + MlirType tensor_type = rocmlirMIXRShapedTypeAsTensor(shaped_type); + MlirAttribute mlir_value_attr = + mlirDenseElementsAttrRawBufferGet(tensor_type, r.get_shape().bytes(), r.data()); + ops.add_attributes({{"value", mlir_value_attr}}); + } + + if(ins->name() == "convolution" or ins->name() == "dot") + { + pp = + problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()}; + } + + std::vector inputs; + transform( + ins->inputs(), std::back_inserter(inputs), [&](auto i) { return ins_map.at(i); }); + ops.add_operands(inputs); + + auto outputs = insert(fbody, std::move(ops)); + if(ins->name() != "@return") + { + assert(outputs.size() == 1); + ins_map[ins] = outputs.front(); + } + } + } + + void run_high_level_pipeline() + { + mlir_pass_manager pm_front{mlirPassManagerCreate(ctx.get())}; + mlirMIGraphXAddHighLevelPipeline(pm_front.get()); + logger.clear(); + if(mlirLogicalResultIsFailure( + mlirPassManagerRunOnOp(pm_front.get(), mlirModuleGetOperation(mmodule.get())))) + { + std::string error = "Invalid MLIR created: " + logger.str(); + if(enabled(MIGRAPHX_TRACE_MLIR{})) + { + std::cout << error << std::endl; + } + MIGRAPHX_THROW(error); + } + } + + void run_backend_pipeline() + { + mlir_pass_manager pm_back{mlirPassManagerCreate(ctx.get())}; + mlirMIGraphXAddBackendPipeline(pm_back.get(), target_arch.c_str()); + logger.clear(); + const size_t trace = value_of(MIGRAPHX_TRACE_MLIR{}); + static std::mutex mutex; + auto mod_op = mlirModuleGetOperation(mmodule.get()); + if(trace >= 2) + { + const std::lock_guard lock(mutex); + std::cout << 
mlir_print(&mlirOperationPrint, mod_op) << std::endl; + } + + if(mlirLogicalResultIsFailure(mlirPassManagerRunOnOp(pm_back.get(), mod_op))) + { + std::string error = "MLIR backend compilation failed: " + logger.str(); + if(enabled(MIGRAPHX_TRACE_MLIR{})) + { + std::cout << error << std::endl; + } + MIGRAPHX_THROW(error); + } + } + + code_object_op compile(const value& solution) + { + // 1st pipeline to call + run_high_level_pipeline(); + if(solution.is_null()) + get_module_tuned(); + else + set_tuning(solution); + // 2nd pipeline to call + run_backend_pipeline(); + + code_object_op op{}; + op.symbol_name = sym_name; + op.code_object = get_binary(); + std::tie(op.global, op.local) = get_launch_params(); + return op; + } + + void set_gpu_properties(const context& migraphx_ctx) + { + const auto& device = migraphx_ctx.get_current_device(); + target_arch = device.get_device_name(); + num_cu = device.get_cu_count(); + } + + std::pair get_launch_params() const + { + uint32_t attrs[2]; + // returns block and grid sizes + mlirGetKernelAttrs(mmodule.get(), attrs); + std::size_t local = attrs[0]; + std::size_t global = local * attrs[1]; + return {global, local}; + } + + value::binary get_binary() const + { + size_t size = 0; + mlirGetBinary(mmodule.get(), &size, nullptr); + value::binary result(size); + if(mlirGetBinary(mmodule.get(), &size, reinterpret_cast(result.data()))) + return result; + MIGRAPHX_THROW("Failed to compile mlir program"); + } + + void set_tuning(const value& v) MIGRAPHX_TIDY_CONST + { + const auto* str = v.if_string(); + if(str == nullptr) + MIGRAPHX_THROW("mlir tuning solutions must be strings"); + if(not mlirRockTuningSetFromStr(mmodule.get(), make_mlir_string_ref(*str))) + MIGRAPHX_THROW("Failed setting tuning key: " + *str); + } + + tuning_config get_tuning_config(bool exhaustive) + { + tuning_config tc; + run_high_level_pipeline(); + auto tuning_mode = + exhaustive ? RocmlirTuningParamSetKindFull : RocmlirTuningParamSetKindQuick; + if(enabled(MIGRAPHX_MLIR_TUNE_EXHAUSTIVE{})) + tuning_mode = RocmlirTuningParamSetKindExhaustive; + mlir_tuning_space params{mlirRockTuningSpaceCreate(mmodule.get(), tuning_mode)}; + const auto limit = + value_of(MIGRAPHX_MLIR_TUNE_LIMIT{}, std::numeric_limits::max()); + for(auto i : range(std::min(limit, mlirRockTuningGetNumParams(params.get())))) + { + mlir_tuning_param param{mlirRockTuningParamCreate()}; + if(not mlirRockTuningParamGet(params.get(), i, param.get())) + MIGRAPHX_THROW("Incorrect mlir tuning parameter: " + std::to_string(i)); + std::array perf_key; + size_t perf_key_bytes = + mlirRockTuningParamToString(param.get(), perf_key.data(), perf_key.size()); + if(perf_key_bytes > perf_key.size()) + MIGRAPHX_THROW("Tuning perf key was " + std::to_string(perf_key_bytes) + + " bytes and thus too long"); + tc.solutions.emplace_back( + std::string(perf_key.begin(), perf_key.begin() + perf_key_bytes)); + } + std::array tuning_key; + size_t tuning_key_bytes = + mlirRockTuningGetKey(mmodule.get(), tuning_key.data(), tuning_key.size()); + if(tuning_key_bytes > tuning_key.size()) + MIGRAPHX_THROW("Tuning table key was " + std::to_string(tuning_key_bytes) + + " bytes and thus too long"); + tc.problem = std::string(tuning_key.begin(), tuning_key.begin() + tuning_key_bytes); + return tc; + } + + std::string get_tune_params(bool xdlops) const { return get_mlir_perf_for_conv(pp, xdlops); } + + // This function appends to tuning cfg file that could be + // used with rocMLIR tuning scripts. 
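+    // The prob_config key is expected to be tab-separated as "<arch>\t<num_cu>\t<problem>",
+    // matching the rows read back by load_tuning_table(). The problem token decides the
+    // output file: keys whose problem string starts with "conv" are appended to the
+    // ".conv" cfg, everything else to the ".gemm" cfg.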
+ void dump_tuning_cfg(const std::string& prob_config) const + { + std::string tuning_cfg_path = string_value_of(MIGRAPHX_MLIR_TUNING_CFG{}); + if(not tuning_cfg_path.empty()) + { + std::vector tokens = split_string(prob_config, '\t'); + std::string prob = tokens[2]; + + if(starts_with(prob, "conv")) + { + tuning_cfg_path += ".conv"; + } + else + { + tuning_cfg_path += ".gemm"; + } + std::ofstream tuning_cfg(tuning_cfg_path, std::ios::app); + prob = + trim(prob, [](unsigned char c) { return (c == '\0') or (std::isspace(c) != 0); }); + tuning_cfg << prob << std::endl; + } + } + + static std::pair load_tuning_table() + { + mlir_tuning_table tuning_table{mlirRockTuningTableCreate()}; + bool found_table = false; + std::string tuning_db_path = string_value_of(MIGRAPHX_MLIR_TUNING_DB{}); + if(not tuning_db_path.empty()) + { + std::ifstream tuning_db_tsv(tuning_db_path); + if(tuning_db_tsv) + { + found_table = true; + std::string line; + while(std::getline(tuning_db_tsv, line)) + { + std::vector tokens = split_string(line, '\t'); + std::string arch = tokens[0]; + std::string num_cu = tokens[1]; + std::string prob = tokens[2]; + std::string perf = tokens[3]; + std::string key = arch.append("\t").append(num_cu).append("\t").append(prob); + mlirRockTuningUpdateTable(tuning_table.get(), + make_mlir_string_ref(key), + make_mlir_string_ref(perf), + 1.0); + } + } + } + else + { + found_table = false; + std::cerr + << "WARNING: MLIR tuning db not found. Please set MIGRAPHX_MLIR_TUNING_DB for " + "optimal performance." + << std::endl; + } + return std::make_pair(std::move(tuning_table), found_table); + } + + bool get_module_tuned() const + { + static std::pair tuning_table = load_tuning_table(); + if(not mlirRockTuningSetFromTable(tuning_table.first.get(), mmodule.get())) + { + std::array prob_config; + size_t prob_config_bytes = + mlirRockTuningGetKey(mmodule.get(), prob_config.data(), prob_config.size()); + if(prob_config_bytes >= prob_config.size()) + { + std::cerr << "MLIR tuning key overflowed buffer, needed " << prob_config_bytes + << " bytes" << std::endl; + return false; + } + std::string prob_config_str(prob_config.begin(), + prob_config.begin() + prob_config_bytes); + if(tuning_table.second) + { + std::cerr << "NOTE: MLIR tuning table did not include a key for " << prob_config_str + << std::endl; + } + dump_tuning_cfg(prob_config_str); + return false; + } + return true; + } + + mlir_context ctx; + MlirLocation location; + mlir_module mmodule; + mlir_logger logger; + problem_params pp; + std::deque strings{}; + std::string target_arch = ""; + std::size_t num_cu = 0; + std::string sym_name; +}; + +bool is_reduce(const instruction& ins) { return contains(ins.name(), "reduce"); } + +static void rewrite_reduce(module& m) +{ + for(auto i : iterator_for(m)) + { + if(is_reduce(*i)) + { + auto reduce_op = i->get_operator().to_value(); + auto reduce_axes = reduce_op["axes"].to_vector(); + auto reduce_lens = i->get_shape().lens(); + auto in_shape = i->inputs().front()->get_shape(); + auto in_lens = in_shape.lens(); + assert(in_shape.standard()); + assert(reduce_lens.size() == in_lens.size()); + assert(std::adjacent_find( + reduce_axes.begin(), reduce_axes.end(), [](auto axis_1, auto axis_2) { + return axis_2 - axis_1 > 1; + }) == reduce_axes.end()); + + std::vector new_rsp_dims; + std::vector new_reduce_axes; + for(const auto axis : range(in_shape.ndim())) + { + if(reduce_lens[axis] == in_lens[axis]) + { + new_rsp_dims.push_back(in_lens[axis]); + } + else if(new_reduce_axes.empty()) + { + 
assert(reduce_lens[axis] == 1); + new_rsp_dims.push_back(-1); + new_reduce_axes.push_back(axis); + } + } + auto rsp_ins = m.insert_instruction( + i, migraphx::make_op("reshape", {{"dims", new_rsp_dims}}), i->inputs().front()); + auto collapsed_reduce = m.insert_instruction( + i, migraphx::make_op("reduce_sum", {{"axes", new_reduce_axes}}), rsp_ins); + auto rsp_back = m.insert_instruction( + i, migraphx::make_op("reshape", {{"dims", reduce_lens}}), collapsed_reduce); + m.replace_instruction(i, rsp_back); + } + } + migraphx::run_passes(m, {migraphx::dead_code_elimination{}}); +} + +bool is_module_fusible(const module& m, const context& migraphx_ctx, const value& solution) +{ + auto mm = m; + rewrite_reduce(mm); + mlir_program mp; + mp.set_gpu_properties(migraphx_ctx); + mp.parse(mm); + mp.run_high_level_pipeline(); + return mlirIsModuleFusible(mp.mmodule.get(), make_mlir_string_ref(*solution.if_string())); +} + +void adjust_param_shapes(module& m, const std::vector& inputs) +{ + auto names = m.get_parameter_names(); + std::sort(names.begin(), names.end()); + for(auto i : range(names.size())) + { + const auto& name = names[i]; + const auto& input = inputs[i]; + auto param = m.get_parameter(name); + assert(param->get_shape().standard()); + if(input.standard()) + continue; + auto new_param = m.add_parameter(name + ".0", input); + m.replace_instruction(param, new_param); + m.remove_instruction(param); + } +} + +void replace_params_with_literals(module& m, const std::vector& inputs) +{ + auto names = m.get_parameter_names(); + std::sort(names.begin(), names.end()); + for(auto i : range(names.size())) + { + const auto& name = names[i]; + const auto& input = inputs[i]; + if(input->name() != "@literal") + continue; + auto param = m.get_parameter(name); + auto lit = m.add_literal(input->get_literal()); + m.replace_instruction(param, lit); + m.remove_instruction(param); + } +} + +std::string dump_mlir(module m, const std::vector& inputs) +{ + const_module_ref mr = &m; + if(not inputs.empty()) + { + adjust_param_shapes(m, inputs); + } + rewrite_reduce(m); + mlir_program mp; + mp.parse(*mr); + auto mod_op = mlirModuleGetOperation(mp.mmodule.get()); + return mlir_print(&mlirOperationPrint, mod_op); +} + +static std::string compute_dump_name(const module& m, const std::string& ext) +{ + std::vector sizes; + for(auto ins : iterator_for(m)) + { + if(contains({"quant_convolution", "quant_dot", "convolution", "dot"}, ins->name())) + sizes.insert(sizes.end(), ins->inputs().begin(), ins->inputs().end()); + } + auto name = + mlir_program::get_symbol_name(m) + "_" + shape::to_sizes_string(to_shapes(sizes)) + ext; + replace_string_inplace(name, ", ", "_"); + replace_string_inplace(name, ":", "s"); + return name; +} + +void dump_mlir_to_file(module m, const std::vector& inputs, const fs::path& location) +{ + static std::mutex mutex; + const std::lock_guard lock(mutex); + + if(not inputs.empty()) + { + adjust_param_shapes(m, inputs); + } + rewrite_reduce(m); + + auto name = compute_dump_name(m, ".mlir"); + auto f = location / name; + std::cout << "Dumping MLIR file to: " << f << std::endl; + + mlir_program mp; + mp.parse(m); + auto mod_op = mlirModuleGetOperation(mp.mmodule.get()); + + std::string mlir_str = mlir_print(&mlirOperationPrint, mod_op); + + write_string(f, mlir_str); +} + +std::string dump_mlir(module m) { return dump_mlir(std::move(m), {}); } + +mlir_code_object compile_mlir(const context& migraphx_ctx, + module m, + const std::vector& in_shapes, + const value& solution) +{ + adjust_param_shapes(m, 
in_shapes); + rewrite_reduce(m); + const bool trace = enabled(MIGRAPHX_TRACE_MLIR{}); + + static std::mutex mutex; + if(trace) + { + const std::lock_guard lock(mutex); + std::cout << m << std::endl; + } + + mlir_program mp; + + mp.set_gpu_properties(migraphx_ctx); + mp.parse(m); + auto mod_op = mlirModuleGetOperation(mp.mmodule.get()); + if(trace) + { + const std::lock_guard lock(mutex); + std::cout << mlir_print(&mlirOperationPrint, mod_op) << std::endl; + } + auto co = mp.compile(solution); + + co.expected_inputs = in_shapes; + auto out_shapes = m.get_output_shapes(); + if(out_shapes.size() == 1) + { + co.output = m.get_output_shapes().front(); + } + else + { + co.output = shape{out_shapes}; + } + mlir_code_object mco; + mco.cop = co; + size_t num_prefill_args = mlirGetNumPrefillArgs(mp.mmodule.get()); + if(num_prefill_args > 0) + { + std::vector prefill_indices(num_prefill_args); + std::vector prefill_mlir_values(num_prefill_args); + mlirGetPrefillArgsInfo( + mp.mmodule.get(), prefill_indices.data(), prefill_mlir_values.data(), num_prefill_args); + std::vector prefill_values(prefill_mlir_values.size()); + std::transform(prefill_mlir_values.begin(), + prefill_mlir_values.end(), + prefill_values.begin(), + [](const auto& v) { + // mlir sets fill attribute as float but migx hip::fill operator only + // supports integer type. + // TODO: Need to add checks that it is indeed an integer. + double dv = mlirFloatAttrGetValueDouble(v); + return static_cast(dv); + }); + mco.prefill_indices = prefill_indices; + mco.prefill_values = prefill_values; + } + return mco; +} + +instruction_ref insert_mlir(module& m, + instruction_ref ins, + code_object_op co, + const std::vector& inputs) +{ + + std::vector refs; + std::size_t last = 0; + refs.reserve(inputs.size()); + std::copy(inputs.begin(), inputs.end(), std::back_inserter(refs)); + last = refs.size() - 1; + co.expected_inputs = to_shapes(refs); + co.output_arg = last; + return m.insert_instruction(ins, co, refs); +} + +tuning_config get_tuning_config_mlir(const context& migraphx_ctx, + module m, + const std::vector& inputs, + bool exhaustive) +{ + adjust_param_shapes(m, inputs); + rewrite_reduce(m); + mlir_program mp; + mp.set_gpu_properties(migraphx_ctx); + mp.parse(m); + auto tc = mp.get_tuning_config(exhaustive); + const bool trace = enabled(MIGRAPHX_TRACE_MLIR{}); + static std::mutex mutex; + if(trace) + { + const std::lock_guard lock(mutex); + std::cout << "Problem: " << tc.problem << std::endl; + auto mod_op = mlirModuleGetOperation(mp.mmodule.get()); + std::cout << mlir_print(&mlirOperationPrint, mod_op) << std::endl; + } + return tc; +} + +void dump_mlir_to_mxr(module m, + const std::vector& inputs, + const fs::path& location) +{ + static std::mutex mutex; + const std::lock_guard lock(mutex); + + adjust_param_shapes(m, to_shapes(inputs)); + replace_params_with_literals(m, inputs); + std::vector sizes; + for(auto ins : iterator_for(m)) + { + if(contains({"quant_convolution", "quant_dot", "convolution", "dot"}, ins->name())) + sizes.insert(sizes.end(), ins->inputs().begin(), ins->inputs().end()); + } + auto name = compute_dump_name(m, ".mxr"); + auto f = location / name; + std::cout << "Dumping MXR file to: " << f << std::endl; + save(program{std::move(m)}, f.string()); +} + +#else + +template +void use(T&) +{ +} + +std::string dump_mlir(module) { return {}; } + +std::string dump_mlir(module m, const std::vector& inputs) +{ + use(m); + use(inputs); + return {}; +} + +// Disabling clang-tidy warning on non-real useage. 
+// NOLINTBEGIN(performance-unnecessary-value-param) +mlir_code_object compile_mlir(const context&, module, const std::vector&, const value&) +{ + return {}; +} + +instruction_ref +// cppcheck-suppress funcArgNamesDifferent +insert_mlir(module& m, instruction_ref, code_object_op co, const std::vector&) +{ + use(co); + use(m); + return m.end(); +} + +tuning_config get_tuning_config_mlir(const context&, module, const std::vector&, bool) +{ + return {}; +} +// NOLINTEND(performance-unnecessary-value-param) + +#endif + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/multinomial.cpp b/docker/rocm/migraphx/targets/gpu/multinomial.cpp new file mode 100644 index 000000000..51e5c48b4 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/multinomial.cpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +shape hip_multinomial::compute_shape(std::vector inputs) const +{ + check_shapes{inputs, *this}.has(3).only_dims(2).standard(); + inputs.pop_back(); + return op.compute_shape(inputs); +} + +argument +hip_multinomial::compute(context& ctx, const shape&, const std::vector& args) const +{ + device::multinomial(ctx.get_stream().get(), args.back(), args.front(), args[1]); + return args.back(); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/no_device.cpp b/docker/rocm/migraphx/targets/gpu/no_device.cpp new file mode 100644 index 000000000..0ccdbac74 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/no_device.cpp @@ -0,0 +1,28 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifdef __HIP_DEVICE_COMPILE__ +#error \ + "Device compilation not allowed for migraphx_gpu. Do not link with hip::device. Device code should go into migraphx_device or migraphx_kernels" +#endif diff --git a/docker/rocm/migraphx/targets/gpu/nonzero.cpp b/docker/rocm/migraphx/targets/gpu/nonzero.cpp new file mode 100644 index 000000000..0ff281f88 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/nonzero.cpp @@ -0,0 +1,44 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +shape hip_nonzero::compute_shape(std::vector inputs) const +{ + return op.compute_shape({inputs.front()}); +} + +argument hip_nonzero::compute(context& ctx, const shape&, const std::vector& args) const +{ + return device::nonzero(ctx.get_stream().get(), args.back(), args.front()); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/pack_args.cpp b/docker/rocm/migraphx/targets/gpu/pack_args.cpp new file mode 100644 index 000000000..2c3f41cf6 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/pack_args.cpp @@ -0,0 +1,48 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +std::vector pack_args(const std::vector& args) +{ + std::vector kernargs; + for(auto&& arg : args) + { + std::size_t n = arg.size; + const auto* p = static_cast(arg.data); + // Insert padding + std::size_t padding = (arg.align - (kernargs.size() % arg.align)) % arg.align; + kernargs.insert(kernargs.end(), padding, 0); + kernargs.insert(kernargs.end(), p, p + n); + } + return kernargs; +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/perfdb.cpp b/docker/rocm/migraphx/targets/gpu/perfdb.cpp new file mode 100644 index 000000000..bdad925db --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/perfdb.cpp @@ -0,0 +1,133 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +namespace { + +std::string get_layout(const shape& s, std::string labels) +{ + auto result = labels; + auto p = find_permutation(s); + std::transform(p.begin(), p.end(), result.begin(), [&](auto i) { return labels[i]; }); + return "'" + result + "'"; +} + +std::string get_type(const shape& s) +{ + static const std::unordered_map m = { + {shape::float_type, "'FP32'"}, + {shape::half_type, "'FP16'"}, + {shape::double_type, "'FP64'"}, + {shape::int8_type, "'INT8'"}, + {shape::int32_type, "'INT32'"}, + }; + auto it = m.find(s.type()); + if(it == m.end()) + return "UNKNOWN"; + return it->second; +} + +std::string generate_miopen_config(const problem_params& pp) +{ + value v = pp.op.to_value(); + auto input = pp.inputs[0].lens(); + auto weights = pp.inputs[1].lens(); + auto padding = v["padding"].to_vector(); + auto stride = v["stride"].to_vector(); + auto dilation = v["dilation"].to_vector(); + if(padding.size() != stride.size()) + padding.erase(padding.begin() + padding.size() / 2, padding.end()); + return to_string_range({std::string{" C.in_channels="}, to_string(input[1]), + std::string{" AND C.in_h="}, to_string(input[2]), + std::string{" AND C.in_w="}, to_string(input[3]), + std::string{" AND C.fil_h="}, to_string(weights[2]), + std::string{" AND C.fil_w="}, to_string(weights[3]), + std::string{" AND C.out_channels="}, to_string(weights[0]), + std::string{" AND C.batchsize="}, to_string(input[0]), + std::string{" AND C.pad_h="}, to_string(padding[0]), + std::string{" AND C.pad_w="}, to_string(padding[2]), + std::string{" AND C.dilation_h="}, to_string(dilation[0]), + std::string{" AND C.dilation_w="}, to_string(dilation[1]), + std::string{" AND C.conv_stride_h="}, to_string(stride[0]), + std::string{" AND C.conv_stride_w="}, to_string(stride[1]), + std::string{" AND C.layout="}, get_layout(pp.inputs[0], "NCHW"), + std::string{" AND C.data_type="}, get_type(pp.inputs[0]), + std::string{" AND C.direction="}, std::string{"'F'"}}, + " "); +} + +auto query_miopen_db(const std::string& query) +{ + static std::mutex g_db_mutex; // NOLINT + const std::lock_guard lock(g_db_mutex); + + // TODO: Store db as a static variable + const auto dbpath = fs::path{"/opt"} / "rocm" / "share" / "miopen" / "db" / "miopen.db"; + // Check if db file exists. + std::ifstream dbs(dbpath); + if(dbs.is_open()) + { + dbs.close(); + } + else + { + std::vector> empty; + return empty; + } + + auto db = sqlite::read(dbpath); + return db.execute(query); +} + +} // namespace + +std::string get_mlir_perf_for_conv(const problem_params& pp, bool xdlops) +{ + std::string solver = xdlops ? "ConvMlirIgemmFwdXdlops" : "ConvMlirIgemmFwd"; + std::string query = "select P.* \ + from perf_db P, config C \ + where P.config = C.id AND \ + P.solver = '${solver}' AND \ + ${config}"; + + auto results = query_miopen_db( + interpolate_string(query, {{"config", generate_miopen_config(pp)}, {"solver", solver}})); + if(results.empty()) + return ""; + return results.front().at("params"); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/pooling.cpp b/docker/rocm/migraphx/targets/gpu/pooling.cpp new file mode 100644 index 000000000..a6f86f077 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/pooling.cpp @@ -0,0 +1,90 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +#if MIGRAPHX_USE_MIOPEN +shape miopen_pooling::compute_shape(const std::vector& inputs) const +{ + check_shapes{inputs, *this}.has(2).standard(); + std::vector pooling_input = {inputs.at(0)}; + check_shapes{pooling_input, *this}.max_ndims(5); + return op.normalize_compute_shape(pooling_input); +} + +inline void reshape_if_1d(shape& input) +{ + auto dims = input.lens(); + + if(dims.size() == 3) + { + std::vector new_dims = dims; + new_dims.insert(new_dims.begin() + 2, 1); + input = shape{input.type(), new_dims}; + } +} + +argument miopen_pooling::compute(context& ctx, + const shape& output_shape, + const std::vector& args) const +{ + shape x_shape = args[0].get_shape(); + shape y_shape = output_shape; + + reshape_if_1d(x_shape); + reshape_if_1d(y_shape); + + auto x_desc = make_tensor(x_shape); + auto y_desc = make_tensor(y_shape); + + float alpha = 1; + float beta = 0; + + miopenPoolingForward(ctx.get_stream().get_miopen(), + pd.get(), + &alpha, + x_desc.get(), + args[0].implicit(), + &beta, + y_desc.get(), + args[1].implicit(), + false, + nullptr, + 0); + + return args[1]; +} + +void miopen_pooling::finalize(context&, const shape&, const std::vector&) +{ + if(pd == nullptr) + pd = make_pooling(op); +} +#endif +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/prefuse_ops.cpp b/docker/rocm/migraphx/targets/gpu/prefuse_ops.cpp new file mode 100644 index 000000000..f8a8f8375 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/prefuse_ops.cpp @@ -0,0 +1,400 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef MIGRAPHX_USE_COMPOSABLEKERNEL +#include +#endif +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_LAYERNORM_FUSION); + +namespace { + +template +struct layernorm_base +{ + float epsilon = 1e-12f; + template + static auto reflect(Self& self, F f) + { + return pack(f(self.epsilon, "epsilon")); + } + shape compute_shape(std::vector inputs, std::vector mods) const + { + std::size_t nargs = N; + if(not mods.empty()) + { + auto* pm = mods.front(); + nargs += pm->get_parameter_names().size() - 1; + } + check_shapes{inputs, static_cast(*this)}.has(nargs); + auto s = inputs.front(); + auto t = s.type(); + if(not mods.empty()) + t = mods.front()->get_output_shapes().front().type(); + + // Scalar output if all inputs are scalar + if(inputs.front().elements() == 1 and + all_of(inputs, [](const auto& ss) { return ss.scalar(); })) + return inputs.front(); + auto l_s = shape::from_permutation( + t, s.lens(), find_permutation(std::vector(inputs.begin(), inputs.begin() + N))); + // just prelayernorm or preadd_layernorm + if(nargs <= N) + return l_s; + // else, layernorm + pointwise fusion, preserve layout of fused op + std::vector lp_s(inputs.begin() + N, inputs.end()); + lp_s.insert(lp_s.begin(), l_s); + return shape::from_permutation(t, s.lens(), find_permutation(lp_s)); + } +}; + +struct layernorm : layernorm_base +{ + + std::string name() const { return "gpu::prelayernorm"; } +}; +MIGRAPHX_REGISTER_OP(layernorm); + +struct add_layernorm : layernorm_base +{ + std::string name() const { return "gpu::preadd_layernorm"; } +}; +MIGRAPHX_REGISTER_OP(add_layernorm); + +struct find_layernorm +{ + auto matcher() const { return match::layernorm(); } + + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + auto x_ins = r.instructions["x"]; + float eps = 0; + if(contains(r.instructions, "eps")) + eps = r.instructions["eps"]->eval().at(); + + m.replace_instruction(ins, layernorm{eps}, x_ins); + } +}; + +struct find_add_layernorm +{ + auto matcher() const + { + return match::name("gpu::prelayernorm")( + match::args(match::name("add")(match::used_once()).bind("add"))); + } + + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + auto add_ins = r.instructions["add"]; + auto op = any_cast(ins->get_operator()); + + m.replace_instruction(ins, add_layernorm{op.epsilon}, add_ins->inputs()); + } +}; + +struct pre_gemm_softmax_gemm : gemm_softmax_gemm +{ + std::string name() const { return "gpu::pre_gemm_softmax_gemm"; } +}; +MIGRAPHX_REGISTER_OP(pre_gemm_softmax_gemm); + +auto is_ck_gemm() +{ + return match::make_basic_pred_matcher([=](instruction_ref ins) { +#ifdef MIGRAPHX_USE_COMPOSABLEKERNEL + if(not enabled(MIGRAPHX_ENABLE_CK{})) + return false; + if(ins->name() != "dot") + return false; + if(not pre_gemm_softmax_gemm::is_ck_supported_type(ins->get_shape().type())) + return false; 
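+        // All checks passed: CK is enabled and this is a dot instruction with a
+        // CK-supported element type.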
+ return true; +#else + (void)ins; + return false; +#endif + }); +} + +auto is_test_gemm(bool enable_attention) +{ + return match::make_basic_pred_matcher([=](instruction_ref ins) { + if(ins->name() != "dot") + return false; + return enable_attention; + }); +} + +auto is_bias_supported() +{ + return match::make_basic_pred_matcher([=](instruction_ref) { +#ifdef MIGRAPHX_USE_COMPOSABLEKERNEL + return not enabled(MIGRAPHX_ENABLE_CK{}); +#else + return true; +#endif + }); +} + +struct find_gemm_softmax_gemm +{ + bool enable_attention = false; + + auto matcher() const + { + auto gemm1 = match::skip(match::name("contiguous"))(match::name("dot")( + match::any_of(is_ck_gemm(), is_test_gemm(enable_attention)).bind("gemm1"))); + auto mul = match::name("mul")( + match::nargs(2), match::either_arg(0, 1)(match::is_constant().bind("scale"), gemm1)); + auto where = match::name("where")(match::arg(2)(match::is_constant().bind("select_const")), + match::arg(1)(mul), + match::arg(0)(match::any().bind("select_cond"))); + auto add = + match::name("add")(is_bias_supported(), + match::nargs(2), + match::either_arg(0, 1)(match::none_of(mul).bind("bias"), mul)); + auto softmax = match::name("softmax")(match::arg(0)(match::any_of(mul, add, gemm1, where))) + .bind("softmax"); + + return match::name("dot")( + match::any_of(is_ck_gemm(), is_test_gemm(enable_attention)).bind("gemm2"))( + match::arg(0)(softmax)); + } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto ins = r.result; + auto gemm2_ins = r.instructions["gemm2"]; + auto gemm1_ins = r.instructions["gemm1"]; + + float scale = 1.0; + if(contains(r.instructions, "scale")) + { + auto scale_lit = r.instructions["scale"]; + // CK only supports single-valued scale + scale_lit->eval().visit([&](const auto s) { + // CK only supports single-valued scale + if(not std::all_of( + s.begin() + 1, s.end(), [&](auto v) { return float_equal(v, s.front()); })) + return; + scale = s.front(); + }); + } + + auto inputs = gemm1_ins->inputs(); // A, B + if(contains(r.instructions, "select_cond")) + { + inputs.push_back(r.instructions["select_cond"]); + inputs.push_back(r.instructions["select_const"]); + } + if(contains(r.instructions, "bias")) + { + inputs.push_back(r.instructions["bias"]); + } + + inputs.push_back(gemm2_ins->inputs().back()); // B1 + + mpm.get_module().replace_instruction( + ins, pre_gemm_softmax_gemm{gemm2_ins->get_operator(), scale}, inputs); + } +}; + +struct gpu_compute_attention_probabilities : op::group_query_attention +{ + std::string name() const { return "gpu::compute_attention_probabilities"; } + + shape compute_shape(std::vector inputs) const + { + auto query_lens = inputs.front().lens(); + auto present_kv_seqlen = inputs.at(1).lens().at(2); + std::vector output_lens{ + query_lens.at(0), num_heads, query_lens.at(2), present_kv_seqlen}; + shape output_shape{inputs.front().type(), output_lens}; + return output_shape; + } +}; +MIGRAPHX_REGISTER_OP(gpu_compute_attention_probabilities); + +struct gpu_compute_attention_scores : op::group_query_attention +{ + std::string name() const { return "gpu::compute_attention_scores"; } + + shape compute_shape(std::vector inputs) const + { + auto query_lens = inputs.front().lens(); + std::size_t q_hidden_size = + (query_lens[1] * query_lens[3] * num_heads) / (num_heads + 2 * kv_num_heads); + std::vector output_lens{query_lens.at(0), query_lens.at(2), q_hidden_size}; + shape output_shape{inputs.front().type(), output_lens}; + return output_shape; + } +}; 
+MIGRAPHX_REGISTER_OP(gpu_compute_attention_scores); + +struct gpu_gqa_rotary_embedding : op::group_query_attention +{ + std::string name() const { return "gpu::gqa_rotary_embedding"; } + + shape compute_shape(std::vector inputs) const { return inputs.front(); } +}; +MIGRAPHX_REGISTER_OP(gpu_gqa_rotary_embedding); + +struct gpu_gqa_softmax : op::group_query_attention +{ + std::string name() const { return "gpu::gqa_softmax"; } + + shape compute_shape(std::vector inputs) const { return inputs.at(2); } +}; +MIGRAPHX_REGISTER_OP(gpu_gqa_softmax); + +struct gpu_concat_past_present : op::group_query_attention +{ + std::string name() const { return "gpu::concat_past_present"; } + + shape compute_shape(std::vector inputs) const { return inputs[0]; } +}; +MIGRAPHX_REGISTER_OP(gpu_concat_past_present); + +struct find_group_query_attention +{ + auto matcher() const { return match::name("group_query_attention"); } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto ins = r.result; + auto inputs = ins->inputs(); + auto v = ins->get_operator().to_value(); + + auto num_heads = v.at("num_heads").to(); + auto kv_num_heads = v.at("kv_num_heads").to(); + auto do_rotary = v.at("do_rotary").to(); + auto local_window_size = v.at("local_window_size").to(); + auto rotary_interleaved = v.at("rotary_interleaved").to(); + auto scale = v.at("scale").to(); + + auto q_shape = inputs[0]->get_shape(); + auto q_lens = q_shape.lens(); + const std::size_t batch_size = q_lens[0]; + const std::size_t sequence_length = q_lens[1]; + std::size_t q_hidden_size = q_lens[2]; + std::size_t head_size = q_hidden_size / (num_heads + 2 * kv_num_heads); + + std::vector bsnh{ + batch_size, sequence_length, num_heads + 2 * kv_num_heads, head_size}; + + auto transposed_qkv = mpm.get_module().insert_instruction( + ins, make_op("reshape", {{"dims", bsnh}}), inputs.at(0)); + + transposed_qkv = mpm.get_module().insert_instruction( + ins, make_op("transpose", {{"permutation", {0, 2, 1, 3}}}), transposed_qkv); + + auto rotary_qkv = transposed_qkv; + if(do_rotary) + { + std::vector rotary_inputs{ + transposed_qkv, inputs.at(5), inputs.at(7), inputs.at(8)}; + rotary_qkv = + mpm.get_module().insert_instruction(ins, + gpu_gqa_rotary_embedding{do_rotary, + kv_num_heads, + local_window_size, + num_heads, + rotary_interleaved, + scale}, + rotary_inputs); + } + + auto pres_k = inputs.at(3); + auto pres_v = inputs.at(4); + std::vector concat_inputs{rotary_qkv, pres_k, pres_v, inputs.at(5)}; + + auto concat = mpm.get_module().insert_instruction( + ins, + gpu_concat_past_present{ + do_rotary, kv_num_heads, local_window_size, num_heads, rotary_interleaved, scale}, + concat_inputs); + auto id = + mpm.get_module().insert_instruction(ins, make_op("identity"), concat, pres_k, pres_v); + + std::vector attn_probs_inputs{id, pres_k, pres_v, inputs.at(5)}; + auto attn_probs = mpm.get_module().insert_instruction( + ins, + gpu_compute_attention_probabilities{ + do_rotary, kv_num_heads, local_window_size, num_heads, rotary_interleaved, scale}, + attn_probs_inputs); + + std::vector softmax_inputs{rotary_qkv, pres_k, attn_probs, inputs.at(5)}; + auto softmax = mpm.get_module().insert_instruction( + ins, + gpu_gqa_softmax{ + do_rotary, kv_num_heads, local_window_size, num_heads, rotary_interleaved, scale}, + softmax_inputs); + std::vector new_inputs{rotary_qkv, pres_k, pres_v, inputs.at(5), softmax}; + + auto get_tuple_elm_0 = std::next(ins); + auto get_tuple_elm_1 = std::next(get_tuple_elm_0); + auto get_tuple_elm_2 = 
std::next(get_tuple_elm_1); + mpm.get_module().replace_instruction(get_tuple_elm_2, pres_v); + mpm.get_module().replace_instruction(get_tuple_elm_1, pres_k); + mpm.get_module().replace_instruction( + get_tuple_elm_0, + gpu_compute_attention_scores{ + do_rotary, kv_num_heads, local_window_size, num_heads, rotary_interleaved, scale}, + new_inputs); + } +}; + +} // namespace + +void prefuse_ops::apply(module_pass_manager& mpm) const +{ + if(not enabled(MIGRAPHX_DISABLE_LAYERNORM_FUSION{})) + { + match::find_matches(mpm.get_module(), find_layernorm{}); + mpm.run_pass(dead_code_elimination{}); + match::find_matches(mpm.get_module(), find_add_layernorm{}); + } + match::find_matches(mpm, find_gemm_softmax_gemm{enable_attention}); + match::find_matches(mpm, find_group_query_attention{}); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/prepare_reduce.cpp b/docker/rocm/migraphx/targets/gpu/prepare_reduce.cpp new file mode 100644 index 000000000..bd5abd42b --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/prepare_reduce.cpp @@ -0,0 +1,122 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct parallel_reduce +{ + operation op; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.op, "op")); + } + + std::string name() const { return "gpu::parallel_reduce"; } + + shape compute_shape(const std::vector& inputs) const + { + std::vector result; + std::transform(inputs.begin(), inputs.end(), std::back_inserter(result), [&](auto input) { + return op.compute_shape({input}); + }); + return shape{result}; + } +}; +MIGRAPHX_REGISTER_OP(parallel_reduce); + +namespace { + +std::vector find_reduce(module& m) +{ + std::vector result; + auto im = iterator_for(m); + std::copy_if(im.begin(), im.end(), std::back_inserter(result), [](auto ins) { + if(contains({"gpu::parallel_reduce", "reduce_mean"}, ins->name())) + return false; + return contains(ins->name(), "reduce"); + }); + return result; +} + +std::vector find_parallel_reduce(const std::vector& r) +{ + std::vector result; + auto ir = iterator_for(r); + transform_if( + ir.begin(), + ir.end(), + std::back_inserter(result), + [&](auto x) { + return std::none_of( + std::next(x), r.end(), [&](auto reduce) { return reaches(*x, reduce); }); + }, + [](auto x) { return *x; }); + return result; +} + +void fuse_reductions(module& m) +{ + auto rs = find_parallel_reduce(find_reduce(m)); + if(rs.size() < 2) + return; + // Only handle the same reduction operator for now + if(std::any_of(std::next(rs.begin()), rs.end(), [&](auto r) { + return rs.front()->name() != r->name(); + })) + return; + auto last = rs.front(); + auto op = last->get_operator(); + std::vector inputs; + std::transform(rs.begin(), rs.end(), std::back_inserter(inputs), [&](auto r) { + return r->inputs().front(); + }); + auto pr = m.insert_instruction(last, parallel_reduce{op}, inputs); + int i = 0; + for(auto r : rs) + { + m.replace_instruction(r, make_op("get_tuple_elem", {{"index", i}}), pr); + i++; + } + m.sort(); +} + +} // namespace + +void prepare_reduce::apply(module& m) const { fuse_reductions(m); } + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/problem_cache.cpp b/docker/rocm/migraphx/targets/gpu/problem_cache.cpp new file mode 100644 index 000000000..8eb25f3b8 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/problem_cache.cpp @@ -0,0 +1,90 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_PROBLEM_CACHE) + +void problem_cache::load() +{ + auto pc_path = string_value_of(MIGRAPHX_PROBLEM_CACHE{}); + if(pc_path.empty()) + return; + if(not fs::exists(pc_path)) + { + std::cout << "Problem cache not found. Creating new file.\n"; + return; + } + from_value(from_json_string(read_string(pc_path)), cache); +} +void problem_cache::save() const +{ + auto pc_path = string_value_of(MIGRAPHX_PROBLEM_CACHE{}); + if(pc_path.empty()) + return; + write_string(pc_path, to_pretty_json_string(to_value(cache))); +} + +static value create_key(const std::string& name, const value& problem) +{ + return {{"name", name}, {"problem", problem}}; +} + +bool problem_cache::has(const std::string& name, const value& problem) const +{ + return contains(cache, create_key(name, problem)); +} + +void problem_cache::insert(const std::string& name, const value& problem, const value& solution) +{ + assert(not solution.is_null()); + cache[create_key(name, problem)] = solution; +} + +void problem_cache::mark(const std::string& name, const value& problem) +{ + cache.insert(std::make_pair(create_key(name, problem), value{})); +} + +optional problem_cache::get(const std::string& name, const value& problem) const +{ + auto it = cache.find(create_key(name, problem)); + if(it == cache.end()) + return nullopt; + return it->second; +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/reverse.cpp b/docker/rocm/migraphx/targets/gpu/reverse.cpp new file mode 100644 index 000000000..ea70e3fbb --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/reverse.cpp @@ -0,0 +1,45 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +shape hip_reverse::compute_shape(std::vector inputs) const +{ + inputs.pop_back(); + return op.normalize_compute_shape(inputs); +} + +argument hip_reverse::compute(context& ctx, const shape&, const std::vector& args) const +{ + return device::reverse(ctx.get_stream().get(), args.back(), args[0], op.axes); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/rnn_variable_seq_lens.cpp b/docker/rocm/migraphx/targets/gpu/rnn_variable_seq_lens.cpp new file mode 100644 index 000000000..e251716f9 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/rnn_variable_seq_lens.cpp @@ -0,0 +1,84 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +shape hip_rnn_var_sl_shift_output::compute_shape(std::vector inputs) const +{ + inputs.pop_back(); + return op.compute_shape(inputs); +} + +argument hip_rnn_var_sl_shift_output::compute(context& ctx, + const shape&, + const std::vector& args) const +{ + device::rnn_var_sl_shift_output(ctx.get_stream().get(), + args.back(), + args.at(0), + args.at(1), + (op.direction == op::rnn_direction::reverse)); + return args.back(); +} + +shape hip_rnn_var_sl_shift_sequence::compute_shape(std::vector inputs) const +{ + inputs.pop_back(); + return op.compute_shape(inputs); +} + +argument hip_rnn_var_sl_shift_sequence::compute(context& ctx, + const shape&, + const std::vector& args) const +{ + device::rnn_var_sl_shift_sequence(ctx.get_stream().get(), args.back(), args.at(0), args.at(1)); + return args.back(); +} + +shape hip_rnn_var_sl_last_output::compute_shape(std::vector inputs) const +{ + inputs.pop_back(); + return op.compute_shape(inputs); +} + +argument hip_rnn_var_sl_last_output::compute(context& ctx, + const shape&, + const std::vector& args) const +{ + device::rnn_var_sl_last_output(ctx.get_stream().get(), + args.back(), + args.at(0), + args.at(1), + (op.direction == op::rnn_direction::reverse)); + return args.back(); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/rocblas.cpp b/docker/rocm/migraphx/targets/gpu/rocblas.cpp new file mode 100644 index 000000000..8c06ad51f --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/rocblas.cpp @@ -0,0 +1,72 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { +#if MIGRAPHX_USE_ROCBLAS +rocblas_handle_ptr create_rocblas_handle_ptr() +{ + // add a call to rocblas_initialize() to workaround a rocblas bug SWDEV-438929 + rocblas_initialize(); + rocblas_handle handle; + rocblas_create_handle(&handle); + return rocblas_handle_ptr{handle}; +} + +rocblas_handle_ptr create_rocblas_handle_ptr(hipStream_t s) +{ + rocblas_handle_ptr rb = create_rocblas_handle_ptr(); + rocblas_set_stream(rb.get(), s); + return rb; +} +#endif +bool get_compute_fp32_flag() +{ + const auto device_name = trim(split_string(get_device_name(), ':').front()); + return (starts_with(device_name, "gfx9") and device_name >= "gfx908"); +} + +bool rocblas_fp8_available() +{ +#if MIGRAPHX_USE_ROCBLAS +#ifndef MIGRAPHX_USE_ROCBLAS_FP8_API + return false; +#else + return gfx_has_fp8fnuz_intrinsics(); +#endif +#else + return false; +#endif +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/schedule_model.cpp b/docker/rocm/migraphx/targets/gpu/schedule_model.cpp new file mode 100644 index 000000000..aa59a693f --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/schedule_model.cpp @@ -0,0 +1,156 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +struct record_event +{ + std::size_t event = 0; + template + static auto reflect(Self& self, F f) + { + return pack(f(self.event, "event")); + } + std::string name() const { return "gpu::record_event"; } + shape compute_shape(const std::vector&) const { return {}; } + + argument compute(context& ctx, const shape&, const std::vector&) const + { + ctx.get_stream().record(ctx.get_event(event)); + return {}; + } + + void finalize(context& ctx, const shape&, const std::vector&) const + { + ctx.create_events(event); + } +}; + +struct wait_event +{ + std::size_t event = 0; + template + static auto reflect(Self& self, F f) + { + return pack(f(self.event, "event")); + } + std::string name() const { return "gpu::wait_event"; } + shape compute_shape(const std::vector&) const { return {}; } + + argument compute(context& ctx, const shape&, const std::vector&) const + { + ctx.get_stream().wait(ctx.get_event(event)); + return {}; + } +}; + +struct set_stream +{ + std::size_t stream = 0; + template + static auto reflect(Self& self, F f) + { + return pack(f(self.stream, "stream")); + } + std::string name() const { return "gpu::set_stream"; } + shape compute_shape(const std::vector&) const { return {}; } + + argument compute(context& ctx, const shape&, const std::vector&) const + { + ctx.set_stream(stream); + return {}; + } + void finalize(context& ctx, const shape&, const std::vector&) const + { + ctx.set_stream(stream); + } +}; + +MIGRAPHX_REGISTER_OP(record_event) +MIGRAPHX_REGISTER_OP(wait_event) +MIGRAPHX_REGISTER_OP(set_stream) + +std::size_t schedule_model::concurrency() const { return streams; } +void schedule_model::sched(module& m, instruction_ref ins, std::size_t n) const +{ + auto last_stream = std::find_if(std::make_reverse_iterator(ins), + std::make_reverse_iterator(m.begin()), + [&](auto&& i) { return i.name() == "gpu::set_stream"; }); + if(last_stream != std::make_reverse_iterator(m.begin())) + { + auto&& op = any_cast(last_stream->get_operator()); + // If the same stream was set earlier then skip + if(op.stream == n) + return; + } + m.insert_instruction(ins, set_stream{n}); +} + +void schedule_model::wait(module& m, instruction_ref ins, std::size_t wait_id) const +{ + m.insert_instruction(ins, wait_event{wait_id}); +} +void schedule_model::record(module& m, instruction_ref ins, std::size_t wait_id) const +{ + m.insert_instruction(std::next(ins), record_event{wait_id}); +} + +static std::unordered_map create_weight_map() +{ + return {{"hip::load_literal", 0}, + {"hip::hip_allocate_memory", 0}, + {"hip::hip_load_memory", 0}, + {"hip::allocate", 0}, + {"gpu::convolution", 8}, + {"gpu::conv_bias_relu", 8}, + {"gpu::pooling", 4}, + {"gpu::gemm", 4}}; +} + +static const std::unordered_map& weight_map() +{ + static const std::unordered_map m = create_weight_map(); + return m; +} + +std::size_t schedule_model::weight(const operation& op) const +{ + if(weight_map().count(op.name()) == 0) + { + return 2; + } + return weight_map().at(op.name()); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/sync_device.cpp b/docker/rocm/migraphx/targets/gpu/sync_device.cpp new file mode 100644 index 000000000..4e8a176eb --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/sync_device.cpp @@ -0,0 +1,55 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, 
Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +void sync_device::apply(module& m) const +{ + auto last = std::prev(m.end()); + if(last->name() == "@return") + { + auto inputs = last->inputs(); + if(std::any_of(inputs.begin(), inputs.end(), [](auto i) { + return (i->name() == "hip::copy_from_gpu"); + })) + { + auto sync_in = m.insert_instruction(last, make_op("hip::sync_stream"), inputs); + if(not inputs.empty()) + { + m.replace_instruction(inputs.front(), sync_in); + } + } + } +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/target.cpp b/docker/rocm/migraphx/targets/gpu/target.cpp new file mode 100644 index 000000000..bf2fc3e86 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/target.cpp @@ -0,0 +1,280 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
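The sync_device pass above appends a hip::sync_stream before @return whenever one of the returned values was produced by hip::copy_from_gpu: a device-to-host copy enqueued on a stream is not safe to read on the host until that stream has been synchronized. A standalone HIP sketch of the hazard this pass guards against (error checking omitted for brevity):

#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

int main()
{
    hipStream_t stream;
    hipStreamCreate(&stream);

    std::vector<float> host_in(1024, 1.0f), host_out(1024, 0.0f);
    float* device_buf = nullptr;
    hipMalloc(reinterpret_cast<void**>(&device_buf), host_in.size() * sizeof(float));

    hipMemcpyAsync(device_buf, host_in.data(), host_in.size() * sizeof(float),
                   hipMemcpyHostToDevice, stream);
    // ... kernels writing device_buf would be enqueued on the same stream here ...
    hipMemcpyAsync(host_out.data(), device_buf, host_out.size() * sizeof(float),
                   hipMemcpyDeviceToHost, stream);

    // Without this synchronization the host could read host_out before the
    // asynchronous copy has finished; this is the role of hip::sync_stream.
    hipStreamSynchronize(stream);

    std::printf("%f\n", host_out.front());
    hipFree(device_buf);
    hipStreamDestroy(stream);
}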
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS) +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC) +#ifndef _WIN32 +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK) +#endif +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_HIPBLASLT_GEMM) + +std::vector target::get_passes(migraphx::context& gctx, const compile_options& options) const +{ + auto& ctx = any_cast(gctx); + ctx.set_exhaustive_tune_flag(options.exhaustive_tune); + ctx.load_problem_cache(); + std::set unsupported_types(shape::types().begin(), shape::types().end()); + unsupported_types.erase(shape::type_t::float_type); + unsupported_types.erase(shape::type_t::fp8e4m3fnuz_type); + unsupported_types.erase(shape::type_t::fp8e5m2fnuz_type); + unsupported_types.erase(shape::type_t::fp8e4m3fn_type); + unsupported_types.erase(shape::type_t::fp8e5m2_type); + unsupported_types.erase(shape::type_t::half_type); + unsupported_types.erase(shape::type_t::bool_type); + unsupported_types.erase(shape::type_t::int8_type); + unsupported_types.erase(shape::type_t::uint8_type); + unsupported_types.erase(shape::type_t::int32_type); + unsupported_types.erase(shape::type_t::tuple_type); + unsupported_types.erase(shape::type_t::bf16_type); + + // whiltelist supported Ops for the FP8 types + // different between fp8e4m3fnuz and OCP types because rocBLAS only has + // support for fp8e4m3fnuz + std::set unsupported_fp8e4m3fnuz_ops = {}; + if(not enabled(MIGRAPHX_ENABLE_HIPBLASLT_GEMM{}) and not gpu::rocblas_fp8_available()) + { + unsupported_fp8e4m3fnuz_ops.insert("dot"); + unsupported_fp8e4m3fnuz_ops.insert("quant_dot"); + } +#if MIGRAPHX_USE_MIOPEN + // MIOpen doesn't have support for fp8 pooling yet. 
+ unsupported_fp8e4m3fnuz_ops.insert("pooling"); +#endif + if(not gpu::gfx_has_fp8fnuz_intrinsics()) + { + unsupported_fp8e4m3fnuz_ops.insert("convolution"); + unsupported_fp8e4m3fnuz_ops.insert("quant_convolution"); + } + // add all device kernels + unsupported_fp8e4m3fnuz_ops.insert("logsoftmax"); + unsupported_fp8e4m3fnuz_ops.insert("nonzero"); + unsupported_fp8e4m3fnuz_ops.insert("prefix_scan_sum"); + unsupported_fp8e4m3fnuz_ops.insert("scatter_none"); + unsupported_fp8e4m3fnuz_ops.insert("topk"); + unsupported_fp8e4m3fnuz_ops.insert("rnn_var_sl_shift_output"); + unsupported_fp8e4m3fnuz_ops.insert("multinomial"); + unsupported_fp8e4m3fnuz_ops.insert("argmax"); + unsupported_fp8e4m3fnuz_ops.insert("argmin"); + + std::set unsupported_fp8e5m2fnuz_ops = unsupported_fp8e4m3fnuz_ops; + // disable gemm for fp8e5m2fnuz if rocBLAS is being used + if(not enabled(MIGRAPHX_ENABLE_HIPBLASLT_GEMM{})) + { + unsupported_fp8e5m2fnuz_ops.insert("dot"); + unsupported_fp8e5m2fnuz_ops.insert("quant_dot"); + } + + std::set unsupported_fp8ocp_ops = {}; + // TODO: remove this when the flag is removed + if(not enabled(MIGRAPHX_ENABLE_HIPBLASLT_GEMM{})) + { + unsupported_fp8ocp_ops.insert("dot"); + unsupported_fp8ocp_ops.insert("quant_dot"); + } +#if MIGRAPHX_USE_MIOPEN + // MIOpen doesn't have support for fp8 pooling yet. + unsupported_fp8ocp_ops.insert("pooling"); +#endif + if(not gpu::gfx_has_fp8ocp_intrinsics()) + { + unsupported_fp8ocp_ops.insert("convolution"); + unsupported_fp8ocp_ops.insert("quant_convolution"); + unsupported_fp8ocp_ops.insert("dot"); + unsupported_fp8ocp_ops.insert("quant_dot"); + } + // add all device kernels + unsupported_fp8ocp_ops.insert("logsoftmax"); + unsupported_fp8ocp_ops.insert("nonzero"); + unsupported_fp8ocp_ops.insert("prefix_scan_sum"); + unsupported_fp8ocp_ops.insert("scatter_none"); + unsupported_fp8ocp_ops.insert("topk"); + unsupported_fp8ocp_ops.insert("rnn_var_sl_shift_output"); + unsupported_fp8ocp_ops.insert("multinomial"); + unsupported_fp8ocp_ops.insert("argmax"); + unsupported_fp8ocp_ops.insert("argmin"); + + // clang-format off + return + { + split_single_dyn_dim{}, + dead_code_elimination{}, + simplify_dyn_ops{}, + dead_code_elimination{}, + normalize_ops{}, + dead_code_elimination{}, + eliminate_identity{}, + dead_code_elimination{}, + enable_pass(not gpu::gfx_has_fp8ocp_intrinsics() and gpu::gfx_has_fp8fnuz_intrinsics(), fp8_ocp_to_fnuz{}), + enable_pass(not gpu::gfx_has_fp8ocp_intrinsics() and gpu::gfx_has_fp8fnuz_intrinsics(), dead_code_elimination{}), + simplify_qdq{}, + enable_pass(not mlir_enabled(), rewrite_quantization{}), + dead_code_elimination{}, + // workaround for rocBLAS unsupported error when using uint8 in quant_dot, quant_convolution & pooling + eliminate_data_type{{migraphx::shape::uint8_type}, shape::float_type, {"quant_convolution", "quant_dot", "pooling"}}, + eliminate_data_type{unsupported_types, shape::type_t::float_type}, + simplify_reshapes{}, + eliminate_identity{}, + eliminate_pad{}, + dead_code_elimination{}, + insert_pad{{"convolution"}}, + dead_code_elimination{}, + rewrite_rnn{}, + dead_code_elimination{}, + inline_module{}, + rewrite_pooling{}, + dead_code_elimination{}, + rewrite_gelu{options.fast_math}, + optimize_module{}, + layout_convolution{.channels_last = enabled(MIGRAPHX_ENABLE_NHWC{})}, + dead_code_elimination{}, + prefuse_ops{}, + dead_code_elimination{}, + eliminate_data_type{{migraphx::shape::fp8e4m3fnuz_type}, shape::float_type, unsupported_fp8e4m3fnuz_ops}, + 
eliminate_data_type{{migraphx::shape::fp8e5m2fnuz_type}, shape::float_type, unsupported_fp8e5m2fnuz_ops}, + eliminate_data_type{{migraphx::shape::fp8e4m3fn_type, migraphx::shape::fp8e5m2_type}, shape::float_type, unsupported_fp8ocp_ops}, + dead_code_elimination{}, + rewrite_reduce{}, + rewrite_low_precision{}, + dead_code_elimination{}, + optimize_module{}, + fuse_pointwise_reduce{}, + dead_code_elimination{}, +#ifndef _WIN32 + enable_pass(enabled(MIGRAPHX_ENABLE_CK{}), fuse_ck{}), +#endif + dead_code_elimination{}, + enable_pass(mlir_enabled(), fuse_mlir{&ctx}), + dead_code_elimination{}, + fuse_concat{}, + dead_code_elimination{}, + auto_contiguous{}, + dead_code_elimination{}, + lowering{&ctx, options.offload_copy}, + eliminate_contiguous{"gpu::contiguous"}, + dead_code_elimination{}, + eliminate_concat{concat_gpu_optimization{}}, + dead_code_elimination{}, +#if MIGRAPHX_USE_MIOPEN + compile_miopen{&gctx}, + dead_code_elimination{}, +#endif + fuse_ops{&ctx, options.fast_math}, + dead_code_elimination{}, +#if MIGRAPHX_USE_HIPBLASLT + compile_hipblaslt{&gctx}, + dead_code_elimination{}, +#endif + replace_allocate{gpu_allocation_model{}, options.offload_copy}, + dead_code_elimination{}, + adjust_allocation{gpu_allocation_model{}}, + dead_code_elimination{}, + compile_ops{&ctx, options.exhaustive_tune}, + dead_code_elimination{}, + promote_literals{}, + dead_code_elimination{}, + write_literals{&ctx}, + schedule{gpu::schedule_model{ctx.get_current_device().nstreams()}, not enabled(MIGRAPHX_DISABLE_SCHEDULE_PASS{})}, + memory_coloring{"hip::allocate"}, + sync_device{}, + preallocate_param{"scratch", gpu_allocation_model{}}, + dead_code_elimination{}, + eliminate_allocation{"hip::allocate"}, + check_context{}, + normalize_ops{}, + dead_code_elimination{}, + eliminate_identity{} + }; + // clang-format on +} + +std::string target::name() const { return "gpu"; } + +migraphx::context target::get_context() const { return context(gpu::get_device_id()); } + +argument target::copy_to(const argument& arg) const { return gpu::to_gpu(arg); } + +argument target::copy_from(const argument& arg) const { return gpu::from_gpu(arg); } + +argument target::allocate(const shape& s) const { return gpu::allocate_gpu(s); } + +MIGRAPHX_REGISTER_TARGET(target); + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/time_op.cpp b/docker/rocm/migraphx/targets/gpu/time_op.cpp new file mode 100644 index 000000000..3b37cfb1f --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/time_op.cpp @@ -0,0 +1,101 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +std::vector generate_arguments(const std::vector& shapes, + unsigned long seed = 0, + random_mode rm = random_mode::random) +{ + std::vector args; + std::transform(shapes.begin(), shapes.end(), std::back_inserter(args), [&](const auto& s) { + return to_gpu(generate_argument(s, seed++, rm)); + }); + return args; +} + +template +double time_loop(migraphx::gpu::context& gctx, int n, F f) +{ + auto start = context::create_event_for_timing(); + auto stop = context::create_event_for_timing(); + f(); + gctx.get_stream().record(start.get()); + for(auto i : range(n)) + { + (void)i; + f(); + } + gctx.get_stream().record(stop.get()); + gctx.finish(); + return context::get_elapsed_ms(start.get(), stop.get()) / n; +} + +double time_op(const context& ictx, operation op, const std::vector& inputs, int n) +{ + // TODO: Use std::ref + migraphx::context ctx = ictx; + auto& gctx = any_cast(ctx); + auto output = op.compute_shape(inputs); + op.finalize(ctx, output, inputs); + auto args = generate_arguments(inputs); + auto run = [&] { op.compute(ctx, output, args); }; + return time_loop(gctx, n, run); +} + +double time_op(const context& ictx, operation op, int n) +{ + auto inputs = any_cast(op).expected_inputs; + return time_op(ictx, op, inputs, n); +} + +double time_program(const context& ictx, program p, int n) +{ + std::vector ctx_vec = {ictx}; + auto& gctx = any_cast(ctx_vec.front()); + auto* mm = p.get_main_module(); + mm->finalize(ctx_vec); + auto in_shapes = p.get_parameter_shapes(); + std::unordered_map param_map; + unsigned long seed = 0; + for(const auto& [name, shape] : in_shapes) + { + param_map[name] = to_gpu(generate_argument(shape, seed++, random_mode::random)); + } + auto run = [&] { p.eval_with_context(ctx_vec, param_map); }; + return time_loop(gctx, n, run); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/topk.cpp b/docker/rocm/migraphx/targets/gpu/topk.cpp new file mode 100644 index 000000000..2e799c650 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/topk.cpp @@ -0,0 +1,56 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
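time_loop above amortizes per-launch overhead by recording a single event pair around n repeated invocations (after one warm-up call) and dividing the elapsed time by n. A standalone HIP sketch of the same pattern; it assumes that whatever f() enqueues targets the given stream:

#include <hip/hip_runtime.h>

// Average the per-iteration time of a callable by bracketing n launches
// with one pair of HIP events, mirroring time_loop above.
template <class F>
double time_ms(hipStream_t stream, int n, F f)
{
    hipEvent_t start;
    hipEvent_t stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);

    f(); // warm-up run, excluded from the measurement
    hipEventRecord(start, stream);
    for(int i = 0; i < n; ++i)
        f();
    hipEventRecord(stop, stream);
    hipEventSynchronize(stop);

    float ms = 0.0f;
    hipEventElapsedTime(&ms, start, stop);
    hipEventDestroy(start);
    hipEventDestroy(stop);
    return ms / n;
}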
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +shape hip_topk::compute_shape(std::vector inputs) const +{ + return op.normalize_compute_shape({inputs.front()}); +} + +argument hip_topk::compute(context& ctx, const shape&, const std::vector& args) const +{ + auto outputs = args.back().get_sub_objects(); + return op.largest ? device::topk_largest(ctx.get_stream().get(), + outputs.front(), + outputs.back(), + args[0], + op.k, + op.axis) + : device::topk_smallest(ctx.get_stream().get(), + outputs.front(), + outputs.back(), + args[0], + op.k, + op.axis); +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/gpu/write_literals.cpp b/docker/rocm/migraphx/targets/gpu/write_literals.cpp new file mode 100644 index 000000000..cbc776737 --- /dev/null +++ b/docker/rocm/migraphx/targets/gpu/write_literals.cpp @@ -0,0 +1,64 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
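hip_topk above writes into the two sub-objects of a tuple-shaped output: the selected values and their indices. As a host-side reference for what the device kernel computes along the chosen axis, here is a 1-D sketch using only the standard library (it assumes k does not exceed the input size):

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <utility>
#include <vector>

// Reference semantics of a 1-D top-k (largest): returns {values, indices},
// matching the two sub-objects of the tuple output produced by hip_topk.
std::pair<std::vector<float>, std::vector<int>>
topk_largest(const std::vector<float>& x, std::size_t k)
{
    std::vector<int> idx(x.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                      [&](int a, int b) { return x[a] > x[b]; });
    idx.resize(k);
    std::vector<float> vals(k);
    std::transform(idx.begin(), idx.end(), vals.begin(), [&](int i) { return x[i]; });
    return {vals, idx};
}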
+ */ +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_COPY_LITERALS) + +void write_literals::apply(module& m) const +{ + assert(ctx != nullptr); + std::size_t n = 0; + for(auto ins : iterator_for(m)) + { + if(ins->name() == "@literal") + { + if(enabled(MIGRAPHX_COPY_LITERALS{})) + { + literal l = ins->get_literal(); + auto pre = m.add_literal(l); + auto alloc = m.insert_instruction(std::next(pre), hip_allocate{l.get_shape()}); + m.replace_instruction(ins, hip_copy_to_gpu{}, pre, alloc); + } + else + { + std::string id = m.name() + ":@literal:" + std::to_string(n); + m.replace_instruction(ins, hip_copy_literal{ins->get_literal(), id}); + n++; + } + } + } +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/ref/CMakeLists.txt b/docker/rocm/migraphx/targets/ref/CMakeLists.txt new file mode 100644 index 000000000..d4b3e63c7 --- /dev/null +++ b/docker/rocm/migraphx/targets/ref/CMakeLists.txt @@ -0,0 +1,44 @@ +##################################################################################### +# The MIT License (MIT) +# +# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +##################################################################################### + +add_library(migraphx_ref + target.cpp + lowering.cpp +) +set_target_properties(migraphx_ref PROPERTIES EXPORT_NAME ref) +rocm_set_soversion(migraphx_ref ${MIGRAPHX_SO_VERSION}) + +rocm_clang_tidy_check(migraphx_ref) +target_link_libraries(migraphx_ref PRIVATE Threads::Threads) +target_link_libraries(migraphx_ref PUBLIC migraphx) + +migraphx_generate_export_header(migraphx_ref) + +rocm_install_targets( + PRIVATE + TARGETS migraphx_ref + INCLUDE + ${CMAKE_CURRENT_SOURCE_DIR}/include +) + diff --git a/docker/rocm/migraphx/targets/ref/include/migraphx/ref/context.hpp b/docker/rocm/migraphx/targets/ref/include/migraphx/ref/context.hpp new file mode 100644 index 000000000..8c2cdfe9d --- /dev/null +++ b/docker/rocm/migraphx/targets/ref/include/migraphx/ref/context.hpp @@ -0,0 +1,43 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP +#define MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace ref { + +struct context +{ + void finish() const {} +}; + +} // namespace ref +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/ref/include/migraphx/ref/lowering.hpp b/docker/rocm/migraphx/targets/ref/include/migraphx/ref/lowering.hpp new file mode 100644 index 000000000..a775fed15 --- /dev/null +++ b/docker/rocm/migraphx/targets/ref/include/migraphx/ref/lowering.hpp @@ -0,0 +1,44 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP +#define MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace ref { + +struct MIGRAPHX_REF_EXPORT lowering +{ + std::string name() const { return "ref::lowering"; } + void apply(module& m) const; +}; + +} // namespace ref +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/ref/include/migraphx/ref/target.hpp b/docker/rocm/migraphx/targets/ref/include/migraphx/ref/target.hpp new file mode 100644 index 000000000..b31b7f9d1 --- /dev/null +++ b/docker/rocm/migraphx/targets/ref/include/migraphx/ref/target.hpp @@ -0,0 +1,53 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_CPU_TARGET_HPP +#define MIGRAPHX_GUARD_MIGRAPHLIB_CPU_TARGET_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +struct pass; +namespace ref { + +struct MIGRAPHX_REF_EXPORT target +{ + std::string name() const; + std::vector get_passes(migraphx::context& ctx, const compile_options&) const; + migraphx::context get_context() const { return context{}; } + + argument copy_to(const argument& arg) const { return arg; } + argument copy_from(const argument& arg) const { return arg; } + argument allocate(const shape& s) const; +}; + +} // namespace ref +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/targets/ref/lowering.cpp b/docker/rocm/migraphx/targets/ref/lowering.cpp new file mode 100644 index 000000000..a0b6b4bd0 --- /dev/null +++ b/docker/rocm/migraphx/targets/ref/lowering.cpp @@ -0,0 +1,504 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace ref { + +template +T zero(const T&) +{ + return T(0); +} + +template +typename std::conditional_t{}, std::make_signed, std::enable_if>:: + type + make_signed(T x) +{ + return x; +} + +struct ref_lrn +{ + op::lrn op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "ref::lrn"; } + shape compute_shape(const std::vector& inputs) const { return op.compute_shape(inputs); } + argument compute(context&, shape output_shape, std::vector args) const + { + argument result{output_shape}; + visit_all(result, args[0])([&](auto output, auto input) { + int n_batch = output_shape.lens()[0]; + int channels = output_shape.lens()[1]; + int height = output_shape.lens()[2]; + int width = output_shape.lens()[3]; + float alphaoverarea = op.alpha / float(op.size); + int radius_lower = (op.size - 1) / 2; + int radius_upper = op.size / 2 + 1; + + par_dfor(n_batch, height, width)([&](int b, int h, int w) { + float scale = 0; + dfor(channels)([&](int c) { + auto start = (c - radius_lower) < 0 ? 0 : (c - radius_lower); + auto end = (c + radius_upper) > channels ? channels : (c + radius_upper); + for(auto k = start; k < end; ++k) + { + scale += std::pow(input(b, k, h, w), 2); + } + scale *= alphaoverarea; + scale += op.bias; + scale = std::pow(scale, -op.beta); + output(b, c, h, w) = input(b, c, h, w) * scale; + }); + }); + }); + return result; + } +}; +MIGRAPHX_REGISTER_OP(ref_lrn) + +template +void visit_quantize_impl(V&& v, T&& x, Ts&&... xs) +{ + x.visit([&](auto y) { visit_all(xs...)([&](auto... ys) { v(y, ys...); }); }); +} + +template +auto visit_quantize(T&& x, Ts&&... 
xs) +{ + return [&](auto v) { + // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70100 + visit_quantize_impl(v, x, xs...); + }; +} + +struct ref_im2col +{ + op::im2col op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + static std::string name() { return "ref::im2col"; } + shape compute_shape(const std::vector& inputs) const + { + return op.normalize_compute_shape(inputs); + } + + argument compute(context&, const shape& output_shape, std::vector args) const + { + argument result{output_shape}; + auto input_shape = args[0].get_shape(); + auto weights_shape = args[1].get_shape(); + visit_all(result, args[0])([&](auto col, auto input) { + const std::size_t& height = input_shape.lens()[2]; + const std::size_t& width = input_shape.lens()[3]; + const std::size_t& channels = weights_shape.lens()[1]; + const std::size_t& kernel_h = weights_shape.lens()[2]; + const std::size_t& kernel_w = weights_shape.lens()[3]; + const std::size_t& pad_h = op.padding[0]; + const std::size_t& pad_w = op.padding[1]; + const std::size_t& stride_h = op.stride[0]; + const std::size_t& stride_w = op.stride[1]; + + long kdiv2_h = long(kernel_h) / 2; + long kdiv2_w = long(kernel_w) / 2; + // calculate output sizes + const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1; + const std::size_t col_width = (width - kernel_w + 2 * pad_w) / stride_w + 1; + // account for padding for the starting position of the input pixels + long iinput = kdiv2_h - long(pad_h); + // loop over output pixels (ioutput, joutput) + for(std::size_t ioutput = 0; ioutput < col_height; ioutput++, iinput += stride_h) + { + long jinput = kdiv2_w - long(pad_w); + for(std::size_t joutput = 0; joutput < col_width; joutput++, jinput += stride_w) + { + // compute linear index for output + std::size_t ldx = ioutput * col_width + joutput; + std::size_t p = 0; + dfor(channels, + kernel_h, + kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) { + auto idx = iinput + long(koffset) - kdiv2_h; + auto jdx = jinput + long(loffset) - kdiv2_w; + col(ldx, p) = + ((idx >= 0) and (idx < height) and (jdx >= 0) and (jdx < width)) + ? 
input(0, c, idx, jdx) + : 0; + p++; + }); + } + } + }); + return result; + } +}; +MIGRAPHX_REGISTER_OP(ref_im2col) + +struct ref_op +{ + operation op = op::identity{}; + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + std::string name() const { return "ref::op"; } + shape compute_shape(const std::vector& inputs) const { return op.compute_shape(inputs); } + argument compute(context&, const shape& output_shape, const std::vector& args) const + { + return op.compute(output_shape, args); + } + value to_value() const + { + value v; + v["name"] = op.name(); + v["operator"] = op.to_value(); + return v; + } + void from_value(const value& v) + { + op = make_op(v.at("name").to(), v.at("operator")); + } + friend std::ostream& operator<<(std::ostream& os, const ref_op& x) + { + os << "ref::" << x.op; + return os; + } +}; +MIGRAPHX_REGISTER_OP(ref_op) + +struct ref_pad +{ + op::pad op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "ref::pad"; } + shape compute_shape(const std::vector& inputs) const { return op.compute_shape(inputs); } + argument compute(context&, const dyn_output& dyn_out, std::vector args) const + { + assert(dyn_out.computed_shape.standard()); + argument result{dyn_out.computed_shape}; + result.visit([&](auto output) { + using type = typename decltype(output)::value_type; + std::fill(output.begin(), output.end(), pad_clamp(op.value)); + }); + + visit_all(result, args[0])([&](auto output, auto input) { + shape_for_each(input.get_shape(), [&](const auto& idx) { + std::vector new_idx(idx.size()); + std::transform( + idx.begin(), idx.end(), op.pads.begin(), new_idx.begin(), [](auto i, auto j) { + return i + j; + }); + output(new_idx.begin(), new_idx.end()) = input(idx.begin(), idx.end()); + }); + }); + + return result; + } +}; +MIGRAPHX_REGISTER_OP(ref_pad) + +struct ref_gemm +{ + op::dot op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + std::string name() const { return "ref::dot"; } + shape compute_shape(const std::vector& inputs) const { return op.compute_shape(inputs); } + + argument compute(context&, const dyn_output& dyn_out, std::vector args) const + { + argument result{dyn_out.computed_shape}; + visit_all(result, args[0], args[1])( + [&](auto cmat, auto amat, auto bmat) { gemm(cmat, amat, bmat, 1.0f, 0.0f); }); + return result; + } +}; +MIGRAPHX_REGISTER_OP(ref_gemm) + +struct ref_quant_gemm +{ + op::quant_dot op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "ref::quant_dot"; } + shape compute_shape(const std::vector& inputs) const { return op.compute_shape(inputs); } + + argument compute(context&, const shape& output_shape, std::vector args) const + { + argument result{output_shape}; + result.visit([&](auto cmat) { + visit_all(args.at(0), args.at(1))( + [&](auto amat, auto bmat) { return gemm(cmat, amat, bmat, 1.0f, 0.0f); }); + }); + return result; + } +}; + +MIGRAPHX_REGISTER_OP(ref_gemm) + +template +struct ref_softmax : auto_register_op> +{ + ref_softmax() = default; + + ref_softmax(Op pop) : op(std::move(pop)) {} + + Op op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "ref::" + op.name(); } + shape compute_shape(const std::vector& inputs) const + { + return op.normalize_compute_shape(inputs); + } + 
argument compute(context&, const dyn_output& dyn_out, std::vector args) const + { + argument result{dyn_out.computed_shape}; + auto batch_lens = dyn_out.computed_shape.lens(); + int64_t tuned_axis = tune_axis(args[0].get_shape().lens().size(), op.axis, op.name()); + std::size_t n_dims = batch_lens[tuned_axis]; + batch_lens[tuned_axis] = 1; + shape batch_shape{shape::int32_type, batch_lens}; + + visit_all(result, args[0])([&](auto output, auto input) { + using value_type = accumulator_type; + std::vector batch_max(batch_shape.elements(), + std::numeric_limits::lowest()); + std::vector batch_sum(batch_shape.elements(), value_type(0)); + par_for(batch_shape.elements(), [&](auto i) { + auto idx = batch_shape.multi(i); + for(std::size_t j = 0; j < n_dims; ++j) + { + idx[tuned_axis] = j; + batch_max[i] = + std::max(batch_max[i], input(idx.begin(), idx.end())); + } + + for(std::size_t j = 0; j < n_dims; ++j) + { + idx[tuned_axis] = j; + std::size_t index = dyn_out.computed_shape.index(idx); + output[index] = std::exp(input[index] - batch_max[i]); + } + + for(std::size_t j = 0; j < n_dims; ++j) + { + idx[tuned_axis] = j; + batch_sum[i] += output(idx.begin(), idx.end()); + } + + for(std::size_t j = 0; j < n_dims; ++j) + { + idx[tuned_axis] = j; + output(idx.begin(), idx.end()) = + op.output()(output(idx.begin(), idx.end()), batch_sum[i]); + } + }); + }); + + return result; + } +}; + +struct ref_rnn_var_sl_last_output +{ + op::rnn_var_sl_last_output op; + + template + static auto reflect(Self& self, F f) + { + return migraphx::reflect(self.op, f); + } + + std::string name() const { return "ref::rnn_var_sl_last_output"; } + + shape compute_shape(std::vector inputs) const + { + return op.compute_shape(std::move(inputs)); + } + + argument compute(const shape& output_shape, std::vector args) const + { + argument result{output_shape}; + auto out_comp_lens = args[0].get_shape().lens(); + out_comp_lens[0] = 1; + shape out_comp_s{output_shape.type(), out_comp_lens}; + + visit_all(result, args[0])([&](auto output, auto input) { + args[1].visit([&](auto seq_lens) { + par_for(output_shape.elements(), [&](auto i) { + auto idx = out_comp_s.multi(i); + auto b = idx[2]; + if(op.direction == op::rnn_direction::reverse or idx[1] == 1) + { + idx[0] = 0; + } + else + { + idx[0] = seq_lens[b] - 1; + } + output[i] = input(idx.begin(), idx.end()); + }); + }); + }); + + return result; + } +}; +MIGRAPHX_REGISTER_OP(ref_rnn_var_sl_last_output) + +struct ref_apply +{ + module* mod; + std::unordered_map> apply_map{}; + + template + auto simple_op() + { + return [this](instruction_ref ins) { apply_simple_op(ins); }; + } + + template + auto extend_op() + { + return [this](instruction_ref ins) { apply_extend_op(ins); }; + } + + void init() + { + apply_map["dot"] = extend_op(); + apply_map["quant_dot"] = extend_op(); + apply_map["im2col"] = extend_op(); + apply_map["logsoftmax"] = extend_op, op::logsoftmax>(); + apply_map["lrn"] = extend_op(); + apply_map["pad"] = extend_op(); + apply_map["softmax"] = extend_op, op::softmax>(); + apply_map["rnn_var_sl_last_output"] = + extend_op(); + } + + void apply() + { + init(); + for(auto it : iterator_for(*mod)) + { + if(apply_map.count(it->name()) > 0) + { + apply_map.at(it->name())(it); + } + else if(is_context_free(it->get_operator())) + { + apply_ref_op(it); + } + } + } + + void apply_ref_op(instruction_ref ins) const + { + mod->replace_instruction(ins, ref_op{ins->get_operator()}, ins->inputs()); + } + + template + void apply_simple_op(instruction_ref ins) + { + 
mod->replace_instruction(ins, T{}, ins->inputs()); + } + + template + void apply_extend_op(instruction_ref ins) + { + auto&& op = any_cast(ins->get_operator()); + mod->replace_instruction(ins, T{op}, ins->inputs()); + } +}; + +void lowering::apply(module& m) const { ref_apply{&m}.apply(); } + +} // namespace ref +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/targets/ref/target.cpp b/docker/rocm/migraphx/targets/ref/target.cpp new file mode 100644 index 000000000..13c15e541 --- /dev/null +++ b/docker/rocm/migraphx/targets/ref/target.cpp @@ -0,0 +1,66 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace ref { + +std::string target::name() const { return "ref"; } + +std::vector target::get_passes(migraphx::context&, const compile_options&) const +{ + return {normalize_ops{}, + eliminate_pad{}, + dead_code_elimination{}, + insert_pad{}, + dead_code_elimination{}, + rewrite_rnn{}, + dead_code_elimination{}, + auto_contiguous{}, + dead_code_elimination{}, + lowering{}, + dead_code_elimination{}}; +} + +argument target::allocate(const shape& s) const { return fill_argument(s, 0); } + +MIGRAPHX_REGISTER_TARGET(target); + +} // namespace ref +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/CMakeLists.txt b/docker/rocm/migraphx/tf/CMakeLists.txt new file mode 100644 index 000000000..49df6d39d --- /dev/null +++ b/docker/rocm/migraphx/tf/CMakeLists.txt @@ -0,0 +1,66 @@ +##################################################################################### +# The MIT License (MIT) +# +# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
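Because the ref target above runs everything on the host with identity copy_to/copy_from, it is the simplest way to exercise a program end to end. Below is a sketch of building, compiling, and evaluating a small graph against it; the header paths, make_target, and generate_argument usage reflect the library's internal C++ API as assumed here rather than anything shown verbatim in this diff:

#include <iostream>
#include <string>
#include <unordered_map>
#include <migraphx/program.hpp>
#include <migraphx/module.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/register_target.hpp>

int main()
{
    // Build a two-input elementwise add.
    migraphx::program p;
    auto* mm = p.get_main_module();
    migraphx::shape s{migraphx::shape::float_type, {2, 3}};
    auto x = mm->add_parameter("x", s);
    auto y = mm->add_parameter("y", s);
    mm->add_return({mm->add_instruction(migraphx::make_op("add"), x, y)});

    // Run the ref target's pass list (normalize_ops, rewrite_rnn, lowering, ...).
    p.compile(migraphx::make_target("ref"));

    // Evaluate with generated host data; no device copies are involved.
    std::unordered_map<std::string, migraphx::argument> params;
    params["x"] = migraphx::generate_argument(s, 0);
    params["y"] = migraphx::generate_argument(s, 1);
    auto results = p.eval(params);
    std::cout << results.front().get_shape() << '\n';
}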
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +##################################################################################### +find_package(Protobuf REQUIRED) + +protobuf_generate_cpp( + PROTO_SRCS PROTO_HDRS + graph.proto + node_def.proto + attr_value.proto + tensor.proto + tensor_shape.proto + resource_handle.proto + types.proto + function.proto + op_def.proto + versions.proto +) +add_library(tf-proto STATIC ${PROTO_SRCS}) +target_include_directories(tf-proto SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${PROTOBUF_INCLUDE_DIR}) +if(MSVC) + target_compile_options(tf-proto PRIVATE /w) +else() + target_compile_options(tf-proto PRIVATE -w) +endif() +target_link_libraries(tf-proto PRIVATE ${PROTOBUF_LIBRARY}) +set_target_properties(tf-proto PROPERTIES POSITION_INDEPENDENT_CODE On) + +file(GLOB TF_SRCS CONFIGURE_DEPENDS *.cpp) +add_library(migraphx_tf ${TF_SRCS}) +migraphx_generate_export_header(migraphx_tf) +target_include_directories(migraphx_tf PRIVATE include) +set_target_properties(migraphx_tf PROPERTIES EXPORT_NAME tf) +rocm_set_soversion(migraphx_tf ${MIGRAPHX_SO_VERSION}) +rocm_clang_tidy_check(migraphx_tf) +target_link_libraries(migraphx_tf PRIVATE tf-proto) +if(NOT WIN32) + target_link_libraries(migraphx_tf PRIVATE "-Wl,--exclude-libs,ALL") +endif() +target_link_libraries(migraphx_tf PUBLIC migraphx) + +rocm_install_targets( + PRIVATE + TARGETS migraphx_tf +) + diff --git a/docker/rocm/migraphx/tf/attr_value.proto b/docker/rocm/migraphx/tf/attr_value.proto new file mode 100644 index 000000000..76944f77b --- /dev/null +++ b/docker/rocm/migraphx/tf/attr_value.proto @@ -0,0 +1,62 @@ +syntax = "proto3"; + +package tensorflow; +option cc_enable_arenas = true; +option java_outer_classname = "AttrValueProtos"; +option java_multiple_files = true; +option java_package = "org.tensorflow.framework"; +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; +import "tensor.proto"; +import "tensor_shape.proto"; +import "types.proto"; + +// Protocol buffer representing the value for an attr used to configure an Op. +// Comment indicates the corresponding attr type. Only the field matching the +// attr type may be filled. 
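On the C++ side, the oneof in AttrValue becomes value_case() plus one accessor per field in the generated code, and NodeDef's attr map is exposed as an associative container. A sketch of pulling a list(int) attribute such as "strides" off a node; the header name follows protobuf's usual <name>.pb.h convention for the node_def.proto listed in the CMake file above, and "strides" is just an illustrative attribute name:

#include <iostream>
#include "node_def.pb.h"

// Read a repeated-int attribute from a NodeDef, if present.
void print_strides(const tensorflow::NodeDef& node)
{
    auto it = node.attr().find("strides");
    if(it == node.attr().end())
        return;
    const tensorflow::AttrValue& value = it->second;
    if(value.value_case() == tensorflow::AttrValue::kList)
    {
        for(auto i : value.list().i())
            std::cout << i << ' ';
        std::cout << '\n';
    }
}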
+message AttrValue { + // LINT.IfChange + message ListValue { + repeated bytes s = 2; // "list(string)" + repeated int64 i = 3 [packed = true]; // "list(int)" + repeated float f = 4 [packed = true]; // "list(float)" + repeated bool b = 5 [packed = true]; // "list(bool)" + repeated DataType type = 6 [packed = true]; // "list(type)" + repeated TensorShapeProto shape = 7; // "list(shape)" + repeated TensorProto tensor = 8; // "list(tensor)" + repeated NameAttrList func = 9; // "list(attr)" + } + // LINT.ThenChange(https://www.tensorflow.org/code/tensorflow/c/c_api.cc) + + oneof value { + bytes s = 2; // "string" + int64 i = 3; // "int" + float f = 4; // "float" + bool b = 5; // "bool" + DataType type = 6; // "type" + TensorShapeProto shape = 7; // "shape" + TensorProto tensor = 8; // "tensor" + ListValue list = 1; // any "list(...)" + + // "func" represents a function. func.name is a function's name or + // a primitive op's name. func.attr.first is the name of an attr + // defined for that function. func.attr.second is the value for + // that attr in the instantiation. + NameAttrList func = 10; + + // This is a placeholder only used in nodes defined inside a + // function. It indicates the attr value will be supplied when + // the function is instantiated. For example, let us suppose a + // node "N" in function "FN". "N" has an attr "A" with value + // placeholder = "foo". When FN is instantiated with attr "foo" + // set to "bar", the instantiated node N's attr A will have been + // given the value "bar". + string placeholder = 9; + } +} + +// A list of attr names and their values. The whole list is attached +// with a string name. E.g., MatMul[T=float]. +message NameAttrList { + string name = 1; + map attr = 2; +} diff --git a/docker/rocm/migraphx/tf/function.proto b/docker/rocm/migraphx/tf/function.proto new file mode 100644 index 000000000..ce7f8d60e --- /dev/null +++ b/docker/rocm/migraphx/tf/function.proto @@ -0,0 +1,102 @@ +syntax = "proto3"; + +package tensorflow; +option cc_enable_arenas = true; +option java_outer_classname = "FunctionProtos"; +option java_multiple_files = true; +option java_package = "org.tensorflow.framework"; +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; +import "attr_value.proto"; +import "node_def.proto"; +import "op_def.proto"; + +// A library is a set of named functions. +message FunctionDefLibrary { + repeated FunctionDef function = 1; + repeated GradientDef gradient = 2; +} + +// A function can be instantiated when the runtime can bind every attr +// with a value. When a GraphDef has a call to a function, it must +// have binding for every attr defined in the signature. +// +// TODO(zhifengc): +// * device spec, etc. +message FunctionDef { + // The definition of the function's name, arguments, return values, + // attrs etc. + OpDef signature = 1; + + // Attributes specific to this function definition. + map attr = 5; + + // NOTE: field id 2 deleted on Jan 11, 2017, GraphDef version 21. + reserved 2; + + // In both of the following fields, there is the need to specify an + // output that is used as either the input to another node (in + // `node_def`) or as a return value of the function (in `ret`). + // Unlike the NodeDefs in GraphDef, we need to be able to specify a + // list in some cases (instead of just single outputs). Also, we + // need to be able to deal with lists of unknown length (so the + // output index may not be known at function definition time). 
So + // we use the following format instead: + // * "fun_in" where "fun_in" is the name of a function input arg in + // the `signature` field above. This represents that input, whether + // it is a single tensor or a list. + // * "fun_in:0" gives the first element of a function input arg (a + // non-list input is considered a list of length 1 for these + // purposes). + // * "node:out" where "node" is the name of a node in `node_def` and + // "out" is the name one of its op's output arguments (the name + // comes from the OpDef of the node's op). This represents that + // node's output, whether it is a single tensor or a list. + // Note: We enforce that an op's output arguments are never + // renamed in the backwards-compatibility test. + // * "node:out:0" gives the first element of a node output arg (a + // non-list output is considered a list of length 1 for these + // purposes). + // + // NOT CURRENTLY SUPPORTED (but may be in the future): + // * "node:out:-1" gives last element in a node output list + // * "node:out:1:" gives a list with all but the first element in a + // node output list + // * "node:out::-1" gives a list with all but the last element in a + // node output list + + // The body of the function. Unlike the NodeDefs in a GraphDef, attrs + // may have values of type `placeholder` and the `input` field uses + // the "output" format above. + + // By convention, "op" in node_def is resolved by consulting with a + // user-defined library first. If not resolved, "func" is assumed to + // be a builtin op. + repeated NodeDef node_def = 3; + + // A mapping from the output arg names from `signature` to the + // outputs from `node_def` that should be returned by the function. + map ret = 4; +} + +// GradientDef defines the gradient function of a function defined in +// a function library. +// +// A gradient function g (specified by gradient_func) for a function f +// (specified by function_name) must follow the following: +// +// The function 'f' must be a numerical function which takes N inputs +// and produces M outputs. Its gradient function 'g', which is a +// function taking N + M inputs and produces N outputs. +// +// I.e. if we have +// (y1, y2, ..., y_M) = f(x1, x2, ..., x_N), +// then, g is +// (dL/dx1, dL/dx2, ..., dL/dx_N) = g(x1, x2, ..., x_N, +// dL/dy1, dL/dy2, ..., dL/dy_M), +// where L is a scalar-value function of (x1, x2, ..., xN) (e.g., the +// loss function). dL/dx_i is the partial derivative of L with respect +// to x_i. +message GradientDef { + string function_name = 1; // The function name. + string gradient_func = 2; // The gradient function's name. +} diff --git a/docker/rocm/migraphx/tf/graph.proto b/docker/rocm/migraphx/tf/graph.proto new file mode 100644 index 000000000..14d9edfab --- /dev/null +++ b/docker/rocm/migraphx/tf/graph.proto @@ -0,0 +1,56 @@ +syntax = "proto3"; + +package tensorflow; +option cc_enable_arenas = true; +option java_outer_classname = "GraphProtos"; +option java_multiple_files = true; +option java_package = "org.tensorflow.framework"; +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; +import "node_def.proto"; +import "function.proto"; +import "versions.proto"; + +// Represents the graph of operations +message GraphDef { + repeated NodeDef node = 1; + + // Compatibility versions of the graph. See core/public/version.h for version + // history. The GraphDef version is distinct from the TensorFlow version, and + // each release of TensorFlow will support a range of GraphDef versions. 
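A GraphDef serialized by TensorFlow can be loaded with the generated classes and walked node by node, which is essentially where the importer in this directory starts. A small sketch, assuming the usual graph.pb.h generated header and the standard name/op fields of NodeDef:

#include <fstream>
#include <iostream>
#include "graph.pb.h"

// Load a frozen GraphDef and list each node's name and op.
int main(int argc, char** argv)
{
    if(argc < 2)
        return 1;
    GOOGLE_PROTOBUF_VERIFY_VERSION;
    std::ifstream input(argv[1], std::ios::binary);
    tensorflow::GraphDef graph;
    if(not graph.ParseFromIstream(&input))
    {
        std::cerr << "failed to parse GraphDef\n";
        return 1;
    }
    for(const auto& node : graph.node())
        std::cout << node.name() << " : " << node.op() << '\n';
}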
+ VersionDef versions = 4; + + // Deprecated single version field; use versions above instead. Since all + // GraphDef changes before "versions" was introduced were forward + // compatible, this field is entirely ignored. + int32 version = 3 [deprecated = true]; + + // EXPERIMENTAL. DO NOT USE OR DEPEND ON THIS YET. + // + // "library" provides user-defined functions. + // + // Naming: + // * library.function.name are in a flat namespace. + // NOTE: We may need to change it to be hierarchical to support + // different orgs. E.g., + // { "/google/nn", { ... }}, + // { "/google/vision", { ... }} + // { "/org_foo/module_bar", { ... }} + // map named_lib; + // * If node[i].op is the name of one function in "library", + // node[i] is deemed as a function call. Otherwise, node[i].op + // must be a primitive operation supported by the runtime. + // + // + // Function call semantics: + // + // * The callee may start execution as soon as some of its inputs + // are ready. The caller may want to use Tuple() mechanism to + // ensure all inputs are ready in the same time. + // + // * The consumer of return values may start executing as soon as + // the return values the consumer depends on are ready. The + // consumer may want to use Tuple() mechanism to ensure the + // consumer does not start until all return values of the callee + // function are ready. + FunctionDefLibrary library = 2; +}; diff --git a/docker/rocm/migraphx/tf/include/migraphx/tf/op_parser.hpp b/docker/rocm/migraphx/tf/include/migraphx/tf/op_parser.hpp new file mode 100644 index 000000000..7ac7af501 --- /dev/null +++ b/docker/rocm/migraphx/tf/include/migraphx/tf/op_parser.hpp @@ -0,0 +1,102 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_TF_REGISTER_OP_PARSER_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_TF_REGISTER_OP_PARSER_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct op_desc +{ + std::string tf_name = ""; + std::string op_name = ""; +}; + +void register_op_parser(const std::string& name, tf_parser::op_func f); +tf_parser::op_func get_op_parser(const std::string& name); +std::vector get_op_parsers(); + +inline std::vector implicit_multi_op(std::vector inss) +{ + return inss; +} + +inline std::vector implicit_multi_op(instruction_ref ins) { return {ins}; } + +template +void register_op_parser() +{ + T parser; + for(auto&& opd : parser.operators()) + register_op_parser(opd.tf_name, + [opd, parser](auto&&... xs) { return parser.base_parse(opd, xs...); }); +} + +struct register_op_parser_action +{ + template + static void apply() + { + register_op_parser(); + } +}; + +template +struct op_parser : auto_register +{ + bool transpose() const { return false; } + std::vector base_parse(const op_desc& opd, + const tf_parser& parser, + tf_parser::node_info info, + const std::vector& args) const + { + std::vector result; + auto& self = static_cast(*this); + if(self.transpose()) + { + result = implicit_multi_op(self.parse(opd, parser, info, parser.to_nchw(args))); + std::transform(result.begin(), result.end(), result.begin(), [&](auto ins) { + return parser.to_nhwc(ins); + }); + } + else + { + result = implicit_multi_op(self.parse(opd, parser, info, args)); + } + return result; + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/tf/include/migraphx/tf/tf_parser.hpp b/docker/rocm/migraphx/tf/include/migraphx/tf/tf_parser.hpp new file mode 100644 index 000000000..99510512e --- /dev/null +++ b/docker/rocm/migraphx/tf/include/migraphx/tf/tf_parser.hpp @@ -0,0 +1,141 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_TF_PARSER_HPP +#define MIGRAPHX_GUARD_AMDMIGRAPHX_TF_PARSER_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +// namespace tf = tf_for_migraphx; + +struct tf_parser +{ + std::string filename; + std::string path = "."; + using attribute_map = std::unordered_map; + struct node_info + { + attribute_map attributes{}; + std::string name = ""; + module* mm = nullptr; + + instruction_ref make_contiguous(instruction_ref ins) const; + + instruction_ref add_broadcastable_binary_op(const std::string& op_name, + instruction_ref arg0, + instruction_ref arg1) const; + + instruction_ref add_common_op(const std::string& op_name, + std::vector inputs) const; + + template + instruction_ref add_common_op(const std::string& op_name, Ts... xs) const + { + return add_common_op(op_name, {xs...}); + } + + instruction_ref add_instruction(const operation& op, + const std::vector& args) const; + + template + instruction_ref add_instruction(const operation& op, Ts... xs) const + { + return add_instruction(op, {xs...}); + } + instruction_ref add_literal(literal l) const; + template + instruction_ref add_literal(Ts&&... xs) const + { + return add_literal(literal{std::forward(xs)...}); + } + }; + + using node_map = std::map; + using op_func = std::function( + const tf_parser&, const node_info&, std::vector)>; + node_map nodes; + std::vector input_nodes; + std::vector output_node_names; + std::unordered_map instructions; + program prog = program(); + module* mm = prog.get_main_module(); + bool is_nhwc = true; + unsigned int batch_size = 1; + std::size_t default_dim_value = 1; + std::unordered_map> map_input_dims; + + std::unordered_map ops; + + tf_parser(); + operation load(const std::string& name, const node_info& info) const; + bool should_transpose(instruction_ref ins) const; + instruction_ref to_nhwc(instruction_ref ins) const; + instruction_ref to_nchw(instruction_ref ins) const; + instruction_ref to_kcxy(instruction_ref ins) const; + std::vector to_nchw(const std::vector& args) const; + std::vector to_nhwc(const std::vector& args) const; + int64_t parse_axis(int64_t dim, size_t num_dims) const; + // tf stores certain attributes such as strides, dilations, as a 4D input. + // The first and last dims are equal to 1, and the relevant data is in dims 2 and 3. + // This helper function reorders the data to store for the respective operator member variables. 
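+    // Illustrative sketch (this assumes the default NHWC layout, where
+    // parse_axis maps TF axes {0, 1, 2, 3} to NCHW positions {0, 2, 3, 1}):
+    // a "strides" attribute of {1, 2, 3, 1} is reordered to {1, 1, 2, 3},
+    // leaving the spatial strides at indices 2 and 3, which is where the
+    // convolution and pooling parsers read them from.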
+ template + void reorder_data(std::vector& prev_data) const + { + std::vector new_data(prev_data.size()); + for(size_t i = 0; i < new_data.size(); i++) + { + auto new_idx = parse_axis(i, new_data.size()); + new_data.at(new_idx) = prev_data.at(i); + } + prev_data = new_data; + } + + void parse_undefined(module* mm, const std::string& name); + void parse_from(std::istream& is); + void parse_from(const void* data, std::size_t size); + void parse_graph(const tensorflow::GraphDef& graph); + void parse_node(const std::string& name); + literal parse_tensor(const tensorflow::TensorProto& t) const; + shape::type_t parse_type(tensorflow::DataType t) const; + std::vector find_outputs() const; +}; + +std::vector get_axes_from_mask(size_t num_axes, uint32_t mask); + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/docker/rocm/migraphx/tf/node_def.proto b/docker/rocm/migraphx/tf/node_def.proto new file mode 100644 index 000000000..a79c0acd7 --- /dev/null +++ b/docker/rocm/migraphx/tf/node_def.proto @@ -0,0 +1,63 @@ +syntax = "proto3"; + +package tensorflow; +option cc_enable_arenas = true; +option java_outer_classname = "NodeProto"; +option java_multiple_files = true; +option java_package = "org.tensorflow.framework"; +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; +import "attr_value.proto"; + +message NodeDef { + // The name given to this operator. Used for naming inputs, + // logging, visualization, etc. Unique within a single GraphDef. + // Must match the regexp "[A-Za-z0-9.][A-Za-z0-9_./]*". + string name = 1; + + // The operation name. There may be custom parameters in attrs. + // Op names starting with an underscore are reserved for internal use. + string op = 2; + + // Each input is "node:src_output" with "node" being a string name and + // "src_output" indicating which output tensor to use from "node". If + // "src_output" is 0 the ":0" suffix can be omitted. Regular inputs + // may optionally be followed by control inputs that have the format + // "^node". + repeated string input = 3; + + // A (possibly partial) specification for the device on which this + // node should be placed. + // The expected syntax for this string is as follows: + // + // DEVICE_SPEC ::= PARTIAL_SPEC + // + // PARTIAL_SPEC ::= ("/" CONSTRAINT) * + // CONSTRAINT ::= ("job:" JOB_NAME) + // | ("replica:" [1-9][0-9]*) + // | ("task:" [1-9][0-9]*) + // | ("device:" [A-Za-z]* ":" ([1-9][0-9]* | "*") ) + // + // Valid values for this string include: + // * "/job:worker/replica:0/task:1/device:GPU:3" (full specification) + // * "/job:worker/device:GPU:3" (partial specification) + // * "" (no specification) + // + // If the constraints do not resolve to a single device (or if this + // field is empty or not present), the runtime will attempt to + // choose a device automatically. + string device = 4; + + // Operation-specific graph-construction-time configuration. + // Note that this should include all attrs defined in the + // corresponding OpDef, including those with a value matching + // the default -- this allows the default to change and makes + // NodeDefs easier to interpret on their own. However, if + // an attr with a default is not specified in this list, the + // default will be used. + // The "names" (keys) must match the regexp "[a-z][a-z0-9_]+" (and + // one of the names from the corresponding OpDef's attr field). + // The values must have a type matching the corresponding OpDef + // attr's type field. 
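+  // As a non-normative sketch (node, input and attr values here are invented
+  // for illustration), a Conv2D node rendered in text form could look like:
+  //   name: "conv1"
+  //   op: "Conv2D"
+  //   input: "images"
+  //   input: "conv1/weights"
+  //   device: "/job:worker/device:GPU:0"
+  //   attr { key: "T"       value { type: DT_FLOAT } }
+  //   attr { key: "strides" value { list { i: 1 i: 2 i: 2 i: 1 } } }
+  //   attr { key: "padding" value { s: "SAME" } }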
+ // TODO(josh11b): Add some examples here showing best practices. + map attr = 5; +}; diff --git a/docker/rocm/migraphx/tf/op_def.proto b/docker/rocm/migraphx/tf/op_def.proto new file mode 100644 index 000000000..86bea899a --- /dev/null +++ b/docker/rocm/migraphx/tf/op_def.proto @@ -0,0 +1,166 @@ +syntax = "proto3"; + +package tensorflow; +option cc_enable_arenas = true; +option java_outer_classname = "OpDefProtos"; +option java_multiple_files = true; +option java_package = "org.tensorflow.framework"; +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; +import "attr_value.proto"; +import "types.proto"; + +// Defines an operation. A NodeDef in a GraphDef specifies an Op by +// using the "op" field which should match the name of a OpDef. +// LINT.IfChange +message OpDef { + // Op names starting with an underscore are reserved for internal use. + // Names should be CamelCase and match the regexp "[A-Z][a-zA-Z0-9_]*". + string name = 1; + + // For describing inputs and outputs. + message ArgDef { + // Name for the input/output. Should match the regexp "[a-z][a-z0-9_]*". + string name = 1; + + // Human readable description. + string description = 2; + + // Describes the type of one or more tensors that are accepted/produced + // by this input/output arg. The only legal combinations are: + // * For a single tensor: either the "type" field is set or the + // "type_attr" field is set to the name of an attr with type "type". + // * For a sequence of tensors with the same type: the "number_attr" + // field will be set to the name of an attr with type "int", and + // either the "type" or "type_attr" field will be set as for + // single tensors. + // * For a sequence of tensors, the "type_list_attr" field will be set + // to the name of an attr with type "list(type)". + DataType type = 3; + string type_attr = 4; // if specified, attr must have type "type" + string number_attr = 5; // if specified, attr must have type "int" + // If specified, attr must have type "list(type)", and none of + // type, type_attr, and number_attr may be specified. + string type_list_attr = 6; + + // For inputs: if true, the inputs are required to be refs. + // By default, inputs can be either refs or non-refs. + // For outputs: if true, outputs are refs, otherwise they are not. + bool is_ref = 16; + }; + + // Description of the input(s). + repeated ArgDef input_arg = 2; + + // Description of the output(s). + repeated ArgDef output_arg = 3; + + // Description of the graph-construction-time configuration of this + // Op. That is to say, this describes the attr fields that will + // be specified in the NodeDef. + message AttrDef { + // A descriptive name for the argument. May be used, e.g. by the + // Python client, as a keyword argument name, and so should match + // the regexp "[a-z][a-z0-9_]+". + string name = 1; + + // One of the type names from attr_value.proto ("string", "list(string)", + // "int", etc.). + string type = 2; + + // A reasonable default for this attribute if the user does not supply + // a value. If not specified, the user must supply a value. + AttrValue default_value = 3; + + // Human-readable description. + string description = 4; + + // TODO(josh11b): bool is_optional? + + // --- Constraints --- + // These constraints are only in effect if specified. Default is no + // constraints. + + // For type == "int", this is a minimum value. For "list(___)" + // types, this is the minimum length. + bool has_minimum = 5; + int64 minimum = 6; + + // The set of allowed values. 
Has type that is the "list" version + // of the "type" field above (uses the "list" field of AttrValue). + // If type == "type" or "list(type)" above, then the "type" field + // of "allowed_values.list" has the set of allowed DataTypes. + // If type == "string" or "list(string)", then the "s" field of + // "allowed_values.list" has the set of allowed strings. + AttrValue allowed_values = 7; + } + repeated AttrDef attr = 4; + + // Optional deprecation based on GraphDef versions. + OpDeprecation deprecation = 8; + + // One-line human-readable description of what the Op does. + string summary = 5; + + // Additional, longer human-readable description of what the Op does. + string description = 6; + + // ------------------------------------------------------------------------- + // Which optimizations this operation can participate in. + + // True if the operation is commutative ("op(a,b) == op(b,a)" for all inputs) + bool is_commutative = 18; + + // If is_aggregate is true, then this operation accepts N >= 2 + // inputs and produces 1 output all of the same type. Should be + // associative and commutative, and produce output with the same + // shape as the input. The optimizer may replace an aggregate op + // taking input from multiple devices with a tree of aggregate ops + // that aggregate locally within each device (and possibly within + // groups of nearby devices) before communicating. + // TODO(josh11b): Implement that optimization. + bool is_aggregate = 16; // for things like add + + // Other optimizations go here, like + // can_alias_input, rewrite_when_output_unused, partitioning_strategy, etc. + + // ------------------------------------------------------------------------- + // Optimization constraints. + + // Ops are marked as stateful if their behavior depends on some state beyond + // their input tensors (e.g. variable reading op) or if they have + // a side-effect (e.g. printing or asserting ops). Equivalently, stateless ops + // must always produce the same output for the same input and have + // no side-effects. + // + // By default Ops may be moved between devices. Stateful ops should + // either not be moved, or should only be moved if that state can also + // be moved (e.g. via some sort of save / restore). + // Stateful ops are guaranteed to never be optimized away by Common + // Subexpression Elimination (CSE). + bool is_stateful = 17; // for things like variables, queue + + // ------------------------------------------------------------------------- + // Non-standard options. + + // By default, all inputs to an Op must be initialized Tensors. Ops + // that may initialize tensors for the first time should set this + // field to true, to allow the Op to take an uninitialized Tensor as + // input. + bool allows_uninitialized_input = 19; // for Assign, etc. +}; +// LINT.ThenChange( +// https://www.tensorflow.org/code/tensorflow/core/framework/op_def_util.cc) + +// Information about version-dependent deprecation of an op +message OpDeprecation { + // First GraphDef version at which the op is disallowed. + int32 version = 1; + + // Explanation of why it was deprecated and what to use instead. 
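+  // For example, "Use the ResizeBilinear op instead." (illustrative text
+  // only; any human-readable guidance is acceptable here).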
+ string explanation = 2; +}; + +// A collection of OpDefs +message OpList { + repeated OpDef op = 1; +}; diff --git a/docker/rocm/migraphx/tf/op_parser.cpp b/docker/rocm/migraphx/tf/op_parser.cpp new file mode 100644 index 000000000..9a1b25c9b --- /dev/null +++ b/docker/rocm/migraphx/tf/op_parser.cpp @@ -0,0 +1,55 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +std::unordered_map& op_parser_map() +{ + static std::unordered_map m; // NOLINT + return m; +} + +void register_op_parser(const std::string& name, tf_parser::op_func f) +{ + op_parser_map()[name] = std::move(f); +} +tf_parser::op_func get_op_parser(const std::string& name) { return op_parser_map().at(name); } +std::vector get_op_parsers() +{ + std::vector result; + std::transform(op_parser_map().begin(), + op_parser_map().end(), + std::back_inserter(result), + [&](auto&& p) { return p.first; }); + std::sort(result.begin(), result.end()); + return result; +} + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_arg_op.cpp b/docker/rocm/migraphx/tf/parse_arg_op.cpp new file mode 100644 index 000000000..403382325 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_arg_op.cpp @@ -0,0 +1,51 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_arg_op : op_parser +{ + std::vector operators() const { return {{"ArgMax", "argmax"}, {"ArgMin", "argmin"}}; } + + instruction_ref parse(const op_desc& opd, + const tf_parser& /*parser*/, + const tf_parser::node_info& info, + const std::vector& args) const + { + int64_t axis = 0; + axis = args[1]->eval().at(); + auto ins = info.add_instruction(make_op(opd.op_name, {{"axis", axis}}), args.front()); + return info.add_instruction(make_op("squeeze", {{"axes", {axis}}}), ins); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_batchnorm.cpp b/docker/rocm/migraphx/tf/parse_batchnorm.cpp new file mode 100644 index 000000000..4c7772eca --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_batchnorm.cpp @@ -0,0 +1,77 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_batchnorm : op_parser +{ + bool transpose() const { return true; } + std::vector operators() const { return {{"FusedBatchNorm"}, {"FusedBatchNormV3"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + tf_parser::node_info info, + std::vector args) const + { + // different default epsilon than from ONNX + float epsilon = 1e-4f; + if(contains(info.attributes, "epsilon")) + { + epsilon = info.attributes.at("epsilon").f(); + } + + auto x_lens = args[0]->get_shape().lens(); + auto x_type = args[0]->get_shape().type(); + + // unsqueeze tensors of shape (C) to broadcast correctly + auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}}); + + auto scale_unsqueeze = + info.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), args[1]); + auto bias_unsqueeze = + info.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), args[2]); + auto mean_unsqueeze = + info.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), args[3]); + auto var_unsqueeze = + info.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), args[4]); + + auto x_sub_mean = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze); + auto var_eps = info.add_broadcastable_binary_op("add", var_unsqueeze, eps); + auto rsqrt = info.add_instruction(make_op("rsqrt"), var_eps); + auto mul0 = info.add_broadcastable_binary_op("mul", scale_unsqueeze, rsqrt); + auto r0 = info.add_broadcastable_binary_op("mul", x_sub_mean, mul0); + return info.add_broadcastable_binary_op("add", r0, bias_unsqueeze); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_biasadd.cpp b/docker/rocm/migraphx/tf/parse_biasadd.cpp new file mode 100644 index 000000000..3ecdf42cc --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_biasadd.cpp @@ -0,0 +1,54 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_biasadd : op_parser +{ + bool transpose() const { return true; } + std::vector operators() const { return {{"BiasAdd"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + const tf_parser::node_info& info, + std::vector args) const + { + uint64_t axis = 1; // assume output of previous layer is in NCHW (broadcast on channel) + + auto l0 = info.add_instruction( + make_op("broadcast", {{"axis", axis}, {"out_lens", args[0]->get_shape().lens()}}), + args[1]); + return info.add_instruction(make_op("add"), args[0], l0); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_binary_op.cpp b/docker/rocm/migraphx/tf/parse_binary_op.cpp new file mode 100644 index 000000000..0aa30f765 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_binary_op.cpp @@ -0,0 +1,59 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_binary_op : op_parser +{ + bool transpose() const { return true; } + std::vector operators() const + { + return {{"Add", "add"}, + {"AddV2", "add"}, + {"Mul", "mul"}, + {"Pow", "pow"}, + {"SquaredDifference", "sqdiff"}, + {"Sub", "sub"}}; + } + + instruction_ref parse(const op_desc& opd, + const tf_parser& /*parser*/, + const tf_parser::node_info& info, + std::vector args) const + { + if(args.size() != 2) + MIGRAPHX_THROW("binary operators should have 2 operands"); + return info.add_broadcastable_binary_op(opd.op_name, args[0], args[1]); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_cast.cpp b/docker/rocm/migraphx/tf/parse_cast.cpp new file mode 100644 index 000000000..4bcd2905f --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_cast.cpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_cast : op_parser +{ + std::vector operators() const { return {{"Cast"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& parser, + tf_parser::node_info info, + const std::vector& args) const + { + shape::type_t type = parser.parse_type(info.attributes.at("DstT").type()); + return info.add_instruction(make_op("convert", {{"target_type", type}}), args); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_concat.cpp b/docker/rocm/migraphx/tf/parse_concat.cpp new file mode 100644 index 000000000..5b0fbb124 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_concat.cpp @@ -0,0 +1,55 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_concat : op_parser +{ + std::vector operators() const { return {{"ConcatV2"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + tf_parser::node_info info, + std::vector args) const + { + // get index for axis within args + size_t axis_idx = info.attributes.at("N").i(); + int64_t axis = args[axis_idx]->eval().at(); + auto op = make_op("concat", {{"axis", axis}}); + // return only first N arguments (assuming last index is the axis value) + return info.add_instruction( + op, std::vector(args.begin(), args.begin() + args.size() - 1)); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_constant.cpp b/docker/rocm/migraphx/tf/parse_constant.cpp new file mode 100644 index 000000000..5b1400b15 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_constant.cpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_constant_op : op_parser +{ + bool transpose() const { return true; } + std::vector operators() const { return {{"Const"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& parser, + tf_parser::node_info info, + const std::vector& /*args*/) const + { + literal v = parser.parse_tensor(info.attributes.at("value").tensor()); + return info.add_literal(v); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_conv.cpp b/docker/rocm/migraphx/tf/parse_conv.cpp new file mode 100644 index 000000000..cd7b2302c --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_conv.cpp @@ -0,0 +1,112 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_conv : op_parser +{ + bool transpose() const { return true; } + std::vector operators() const { return {{"Conv2D"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& parser, + tf_parser::node_info info, + std::vector args) const + { + op::convolution op; + if(contains(info.attributes, "strides")) + { + std::vector stride; + copy(info.attributes.at("strides").list().i(), std::back_inserter(stride)); + parser.reorder_data(stride); + if(stride.size() != 4) + { + MIGRAPHX_THROW("strides should have 4 values"); + } + op.stride[0] = stride[2]; + op.stride[1] = stride[3]; + } + if(contains(info.attributes, "dilations")) + { + std::vector dilation; + copy(info.attributes.at("dilations").list().i(), std::back_inserter(dilation)); + parser.reorder_data(dilation); + if(dilation.size() != 4) + { + MIGRAPHX_THROW("dilation should have 4 values"); + } + op.dilation[0] = dilation[2]; + op.dilation[1] = dilation[3]; + } + + auto weights = parser.to_kcxy(args[1]); + auto l0 = args[0]; + if(contains(info.attributes, "padding")) + { + const std::string& pad_mode = info.attributes.at("padding").s(); + if(pad_mode.find("SAME") != std::string::npos) + { + std::vector weight_dims = weights->get_shape().lens(); + size_t weight_h = weight_dims[2]; + size_t weight_w = weight_dims[3]; + + auto input_dims = l0->get_shape().lens(); + std::vector pads(input_dims.size()); + calculate_padding(0, pads, input_dims[2], op.stride[0], op.dilation[0], weight_h); + calculate_padding(1, pads, input_dims[3], op.stride[1], op.dilation[1], weight_w); + + op.padding = std::vector(pads.begin(), pads.end()); + } + else if(pad_mode.find("EXPLICIT") != std::string::npos) + { + std::vector padding; + copy(info.attributes.at("explicit_paddings").list().i(), + std::back_inserter(padding)); + if(padding.size() != 4) + { + MIGRAPHX_THROW("padding should have 4 values"); + } + if(padding[0] != padding[2] or padding[1] != padding[3]) + { + MIGRAPHX_THROW("migraphx does not support asymetric padding"); + } + op.padding[0] = padding[0]; + op.padding[1] = padding[1]; + } + } + return info.add_instruction(op, {l0, weights}); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_depthwiseconv.cpp 
b/docker/rocm/migraphx/tf/parse_depthwiseconv.cpp new file mode 100644 index 000000000..7474654ad --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_depthwiseconv.cpp @@ -0,0 +1,125 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_depthwiseconv : op_parser +{ + bool transpose() const { return true; } + std::vector operators() const { return {{"DepthwiseConv2dNative"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& parser, + tf_parser::node_info info, + std::vector args) const + { + op::convolution op; + size_t num_channels = args[0]->get_shape().lens()[1]; + op.group = num_channels; + + if(contains(info.attributes, "strides")) + { + std::vector stride; + copy(info.attributes.at("strides").list().i(), std::back_inserter(stride)); + parser.reorder_data(stride); + if(stride.size() != 4) + { + MIGRAPHX_THROW("strides should have 4 values"); + } + op.stride[0] = stride[2]; + op.stride[1] = stride[3]; + } + + auto weights = parser.to_kcxy(args[1]); + if(contains(info.attributes, "dilations")) + { + std::vector dilation; + copy(info.attributes.at("dilations").list().i(), std::back_inserter(dilation)); + parser.reorder_data(dilation); + if(dilation.size() != 4) + { + MIGRAPHX_THROW("dilation should have 4 values"); + } + op.dilation[0] = dilation[2]; + op.dilation[1] = dilation[3]; + } + + auto l0 = args[0]; + if(contains(info.attributes, "padding")) + { + const std::string& pad_mode = info.attributes.at("padding").s(); + + if(pad_mode.find("SAME") != std::string::npos) + { + std::vector weight_dims = weights->get_shape().lens(); + size_t weight_h = weight_dims[2]; + size_t weight_w = weight_dims[3]; + + auto input_dims = l0->get_shape().lens(); + std::vector pads(input_dims.size()); + calculate_padding(0, pads, input_dims[2], op.stride[0], op.dilation[0], weight_h); + calculate_padding(1, pads, input_dims[3], op.stride[1], op.dilation[1], weight_w); + + if(pads[0] != pads[2] or pads[1] != pads[3]) + { + std::vector padding = {0, 0, pads[0], pads[1], 0, 0, pads[2], pads[3]}; + l0 = info.add_instruction(migraphx::make_op("pad", {{"pads", padding}}), l0); + } + else + { + op.padding[0] = pads[0]; + op.padding[1] = pads[1]; + } + } + } + + std::vector new_weights_shape; + 
copy(weights->get_shape().lens(), std::back_inserter(new_weights_shape)); + + // weight format is (out_channels, in_channels, h, w), but in depthwise_conv, + // out_channels is equal to the multiplier. Adjust by inserting a reshape and + // setting in_channels to 1 + int64_t multiplier = new_weights_shape[0]; + int64_t out_channels = num_channels * multiplier; + new_weights_shape[0] = out_channels; + new_weights_shape[1] = 1; + // Make sure weights are contiguous before doing reshape + auto new_weights = info.add_instruction(make_op("reshape", {{"dims", new_weights_shape}}), + info.make_contiguous(weights)); + + return info.add_instruction(op, {l0, new_weights}); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_expanddims.cpp b/docker/rocm/migraphx/tf/parse_expanddims.cpp new file mode 100644 index 000000000..db74eb9f6 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_expanddims.cpp @@ -0,0 +1,62 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_expanddims : op_parser +{ + std::vector operators() const { return {{"ExpandDims"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + const tf_parser::node_info& info, + std::vector args) const + { + std::vector input_dims = args[0]->get_shape().lens(); + std::vector new_dims(input_dims.begin(), input_dims.end()); + size_t num_dims = input_dims.size(); + int32_t dim = args[1]->eval().at(); + + if(dim < 0) + { + new_dims.insert(new_dims.begin() + (num_dims + dim + 1), 1); + } + else + { + new_dims.insert(new_dims.begin() + dim, 1); + } + return info.add_instruction(make_op("reshape", {{"dims", new_dims}}), args[0]); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_gather.cpp b/docker/rocm/migraphx/tf/parse_gather.cpp new file mode 100644 index 000000000..965b8fa83 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_gather.cpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_gather : op_parser +{ + std::vector operators() const { return {{"GatherV2"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + const tf_parser::node_info& info, + std::vector args) const + { + int axis = args[2]->eval().at(); + return info.add_instruction(make_op("gather", {{"axis", axis}}), {args[0], args[1]}); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_generic_op.cpp b/docker/rocm/migraphx/tf/parse_generic_op.cpp new file mode 100644 index 000000000..e459147fc --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_generic_op.cpp @@ -0,0 +1,58 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_generic_op : op_parser +{ + bool transpose() const { return true; } + std::vector operators() const + { + return {{"All", "identity"}, + {"Identity", "identity"}, + {"LessEqual", "identity"}, + {"Relu", "relu"}, + {"Rsqrt", "rsqrt"}, + {"Tanh", "tanh"}, + {"StopGradient", "identity"}}; + } + + instruction_ref parse(const op_desc& opd, + const tf_parser& /*parser*/, + const tf_parser::node_info& info, + const std::vector& args) const + { + return info.add_instruction(make_op(opd.op_name), args); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_matmul.cpp b/docker/rocm/migraphx/tf/parse_matmul.cpp new file mode 100644 index 000000000..7ca4c52b4 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_matmul.cpp @@ -0,0 +1,85 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_matmul : op_parser +{ + std::vector operators() const + { + return {{"BatchMatMul"}, {"BatchMatMulV2"}, {"MatMul"}}; + } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + tf_parser::node_info info, + std::vector args) const + { + bool transa = false; + bool transb = false; + + if(contains(info.attributes, "transpose_a")) + { + transa = info.attributes.at("transpose_a").b(); + } + if(contains(info.attributes, "transpose_b")) + { + transb = info.attributes.at("transpose_b").b(); + } + + if(contains(info.attributes, "adj_x")) + { + transa = info.attributes.at("adj_x").b(); + } + if(contains(info.attributes, "adj_y")) + { + transb = info.attributes.at("adj_y").b(); + } + + std::vector perm(args[0]->get_shape().lens().size()); + std::iota(perm.begin(), perm.end(), int64_t{0}); + // swap the last two elements + std::iter_swap(perm.end() - 1, perm.end() - 2); + + auto l1 = (transa) + ? info.add_instruction(make_op("transpose", {{"permutation", perm}}), args[0]) + : args[0]; + auto l2 = (transb) + ? 
info.add_instruction(make_op("transpose", {{"permutation", perm}}), args[1]) + : args[1]; + + return info.add_instruction(make_op("dot"), l1, l2); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_mean.cpp b/docker/rocm/migraphx/tf/parse_mean.cpp new file mode 100644 index 000000000..d0c2b9952 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_mean.cpp @@ -0,0 +1,55 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_mean : op_parser +{ + std::vector operators() const { return {{"Mean"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + tf_parser::node_info info, + std::vector args) const + { + bool keep_dims = info.attributes.at("keep_dims").b(); + auto axes = args[1]->eval().get().to_vector(); + + auto ins = info.add_instruction(make_op("reduce_mean", {{"axes", axes}}), args[0]); + if(not keep_dims) + ins = info.add_instruction(make_op("squeeze", {{"axes", axes}}), ins); + return ins; + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_onehot.cpp b/docker/rocm/migraphx/tf/parse_onehot.cpp new file mode 100644 index 000000000..66f7d7d0a --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_onehot.cpp @@ -0,0 +1,69 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_onehot : op_parser +{ + std::vector operators() const { return {{"OneHot"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + tf_parser::node_info info, + std::vector args) const + { + size_t depth = args[1]->eval().at(); + + int64_t axis = -1; + float on_value = args[2]->eval().at(); + float off_value = args[3]->eval().at(); + + std::vector depth_input(depth * depth, off_value); + for(int i = 0; i < depth; i++) + { + depth_input[depth * i + i] = on_value; + } + + if(contains(info.attributes, "axis")) + axis = info.attributes.at("axis").i(); + if(axis == -1) + { + shape s{shape::float_type, {depth, depth}}; + auto l0 = info.add_literal({s, depth_input}); + return info.add_instruction(make_op("gather", {{"axis", 0}}), {l0, args[0]}); + } + MIGRAPHX_THROW("MIGraphX does not support axis != -1"); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_pack.cpp b/docker/rocm/migraphx/tf/parse_pack.cpp new file mode 100644 index 000000000..9766da1bc --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_pack.cpp @@ -0,0 +1,70 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
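// A standalone sketch of the lookup-table trick used by parse_onehot above:
// row i of a depth x depth table holds the one-hot encoding of index i, so a
// gather on axis 0 with the index tensor produces the one-hot result. The
// helper name is illustrative only.
#include <cassert>
#include <cstddef>
#include <vector>

std::vector<float> one_hot_table(std::size_t depth, float on_value, float off_value)
{
    std::vector<float> table(depth * depth, off_value);
    for(std::size_t i = 0; i < depth; i++)
        table[depth * i + i] = on_value; // diagonal entries get on_value
    return table;
}

int main()
{
    assert((one_hot_table(3, 1.0f, 0.0f) ==
            std::vector<float>{1, 0, 0, 0, 1, 0, 0, 0, 1}));
}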
+ */ +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_pack : op_parser +{ + std::vector operators() const { return {{"Pack"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& parser, + tf_parser::node_info info, + std::vector args) const + { + // reinterpret as unsqueeze with concat + std::vector unsqueezed_args; + int64_t axis = 0; + if(contains(info.attributes, "axis")) + axis = info.attributes.at("axis").i(); + size_t input_size = args.front()->get_shape().lens().size(); + if(axis > input_size) + { + MIGRAPHX_THROW("TF_PARSER: axis value of " + to_string(axis) + + " must be smaller than input size " + to_string(input_size)); + } + + std::transform( + args.begin(), + args.end(), + std::back_inserter(unsqueezed_args), + [&](instruction_ref arg) { + return info.add_instruction(make_op("unsqueeze", {{"axes", {axis}}}), arg); + }); + return parser.to_nhwc( + info.add_instruction(make_op("concat", {{"axis", axis}}), unsqueezed_args)); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_pad.cpp b/docker/rocm/migraphx/tf/parse_pad.cpp new file mode 100644 index 000000000..b1d3a587d --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_pad.cpp @@ -0,0 +1,69 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
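// A standalone sketch of the shape effect of lowering Pack above to
// unsqueeze + concat: packing n tensors of identical shape along `axis`
// inserts a new dimension of size n at that axis. Names are illustrative only.
#include <cassert>
#include <cstddef>
#include <vector>

std::vector<std::size_t>
packed_shape(std::vector<std::size_t> dims, std::size_t n, std::size_t axis)
{
    dims.insert(dims.begin() + axis, n); // unsqueeze adds the axis, concat grows it to n
    return dims;
}

int main()
{
    // three [2, 3] tensors packed on axis 0 -> [3, 2, 3]
    assert((packed_shape({2, 3}, 3, 0) == std::vector<std::size_t>{3, 2, 3}));
    // packed on the trailing axis -> [2, 3, 3]
    assert((packed_shape({2, 3}, 3, 2) == std::vector<std::size_t>{2, 3, 3}));
}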
+ */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_pad : op_parser +{ + bool transpose() const { return true; } + std::vector operators() const { return {{"Pad"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& parser, + const tf_parser::node_info& info, + std::vector args) const + { + size_t ndims = args.front()->get_shape().lens().size(); + + // in tf, the paddings are arranged as a 2d shape (ndims, 2), + // the last dim contains the left padding and right padding respectively + std::vector> pad_per_dim(ndims); + auto tf_padding = args[1]->eval().get().to_vector(); + for(size_t i = 0; i < 2 * ndims; i += 2) + { + pad_per_dim[i / 2].first = tf_padding[i]; + pad_per_dim[i / 2].second = tf_padding[i + 1]; + } + parser.reorder_data(pad_per_dim); + + std::vector pads(ndims * 2); + for(size_t i = 0; i < ndims; i++) + { + pads[i] = pad_per_dim[i].first; + pads[i + ndims] = pad_per_dim[i].second; + } + return info.add_instruction(make_op("pad", {{"pads", pads}}), args.front()); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_pooling.cpp b/docker/rocm/migraphx/tf/parse_pooling.cpp new file mode 100644 index 000000000..4baf09a86 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_pooling.cpp @@ -0,0 +1,97 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_pooling : op_parser +{ + bool transpose() const { return true; } + std::vector operators() const { return {{"AvgPool"}, {"MaxPool"}}; } + + instruction_ref parse(const op_desc& opd, + const tf_parser& parser, + tf_parser::node_info info, + std::vector args) const + { + if(not starts_with(opd.tf_name, "Max") and not starts_with(opd.tf_name, "Av")) + { + MIGRAPHX_THROW("tf pooling mode must be Max or Average"); + } + op::pooling op{starts_with(opd.tf_name, "Max") ? 
op::pooling_mode::max + : op::pooling_mode::average}; + + if(contains(info.attributes, "strides")) + { + std::vector stride; + copy(info.attributes.at("strides").list().i(), std::back_inserter(stride)); + parser.reorder_data(stride); + if(stride.size() != 4) + { + MIGRAPHX_THROW("strides should have 4 values"); + } + op.stride[0] = stride[2]; + op.stride[1] = stride[3]; + } + if(contains(info.attributes, "ksize")) + { + std::vector ksize; + copy(info.attributes.at("ksize").list().i(), std::back_inserter(ksize)); + parser.reorder_data(ksize); + if(ksize.size() != 4) + { + MIGRAPHX_THROW("ksize should have 4 values"); + } + op.lengths[0] = ksize[2]; + op.lengths[1] = ksize[3]; + } + + auto l0 = args[0]; + if(contains(info.attributes, "padding")) + { + const std::string& pad_mode = info.attributes.at("padding").s(); + if(pad_mode.find("SAME") != std::string::npos) + { + auto input_dims = l0->get_shape().lens(); + std::vector pads(input_dims.size()); + calculate_padding(0, pads, input_dims[2], op.stride[0], 1, op.lengths[0]); + calculate_padding(1, pads, input_dims[3], op.stride[1], 1, op.lengths[1]); + + op.padding = std::vector(pads.begin(), pads.end()); + } + } + return info.add_instruction(op, l0); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_relu6.cpp b/docker/rocm/migraphx/tf/parse_relu6.cpp new file mode 100644 index 000000000..75155b432 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_relu6.cpp @@ -0,0 +1,54 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
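// A standalone sketch of the padding relayout done in the Pad parser earlier:
// TF stores paddings as an (ndims, 2) tensor flattened to
// {d0_before, d0_after, d1_before, d1_after, ...}, while the pad operator
// takes {befores..., afters...}. The NHWC->NCHW reordering step is omitted
// here; the helper name is illustrative only.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> to_flat_pads(const std::vector<int64_t>& tf_padding)
{
    std::size_t ndims = tf_padding.size() / 2;
    std::vector<int64_t> pads(ndims * 2);
    for(std::size_t i = 0; i < ndims; i++)
    {
        pads[i]         = tf_padding[2 * i];     // pad before dim i
        pads[i + ndims] = tf_padding[2 * i + 1]; // pad after dim i
    }
    return pads;
}

int main()
{
    // dims padded by (1,2) and (0,3) -> {1, 0, 2, 3}
    assert((to_flat_pads({1, 2, 0, 3}) == std::vector<int64_t>{1, 0, 2, 3}));
}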
+ */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_relu6 : op_parser +{ + bool transpose() const { return true; } + std::vector operators() const { return {{"Relu6"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + const tf_parser::node_info& info, + std::vector args) const + { + shape::type_t output_type = args[0]->get_shape().type(); + auto min_val = info.add_literal(migraphx::literal{migraphx::shape{output_type}, {0.0f}}); + auto max_val = info.add_literal(migraphx::literal{migraphx::shape{output_type}, {6.0f}}); + + return info.add_common_op("clip", args[0], min_val, max_val); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_reshape.cpp b/docker/rocm/migraphx/tf/parse_reshape.cpp new file mode 100644 index 000000000..4a1c7e697 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_reshape.cpp @@ -0,0 +1,54 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_reshape : op_parser +{ + std::vector operators() const { return {{"Reshape"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + const tf_parser::node_info& info, + std::vector args) const + { + if(args.size() != 2) + MIGRAPHX_THROW("reshape needs 2 arguments (input, new_shape)"); + auto s = args[1]->eval(); + std::vector dims; + s.visit([&](auto v) { copy(v, std::back_inserter(dims)); }); + return info.add_instruction(make_op("reshape", {{"dims", dims}}), args[0]); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_shape.cpp b/docker/rocm/migraphx/tf/parse_shape.cpp new file mode 100644 index 000000000..bdc850ef3 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_shape.cpp @@ -0,0 +1,56 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
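// Relu6 above is lowered to clip(x, 0, 6), with the scalar bounds added as
// literals and broadcast through add_common_op. Element-wise the operation is
// the following; the function name is illustrative only.
#include <algorithm>
#include <cassert>

float relu6(float x) { return std::min(std::max(x, 0.0f), 6.0f); }

int main()
{
    assert(relu6(-1.0f) == 0.0f); // clamped below at 0
    assert(relu6(3.5f) == 3.5f);  // passed through
    assert(relu6(10.0f) == 6.0f); // clamped above at 6
}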
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_shape : op_parser +{ + std::vector operators() const { return {{"Shape"}}; } + + // Use a literal instruction to replace the shape since output of + // shape operator are literals in migraphx + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + const tf_parser::node_info& info, + std::vector args) const + { + std::vector arg_shape = args[0]->get_shape().lens(); + std::vector vec_shape(arg_shape.size()); + migraphx::shape s(migraphx::shape::int32_type, {arg_shape.size()}); + std::transform( + arg_shape.begin(), arg_shape.end(), vec_shape.begin(), [](auto i) { return i; }); + return info.add_literal(migraphx::literal{s, vec_shape}); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_slice.cpp b/docker/rocm/migraphx/tf/parse_slice.cpp new file mode 100644 index 000000000..4fd9be29c --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_slice.cpp @@ -0,0 +1,69 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
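// parse_shape above folds Shape into a literal because tensor shapes are
// static at parse time: the input's lens() are narrowed to int32 and stored
// as a 1-D literal. A standalone sketch of that narrowing; names are
// illustrative only.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int32_t> shape_as_int32(const std::vector<std::size_t>& lens)
{
    std::vector<int32_t> out(lens.size());
    std::transform(lens.begin(), lens.end(), out.begin(), [](std::size_t d) {
        return static_cast<int32_t>(d);
    });
    return out;
}

int main()
{
    assert((shape_as_int32({1, 3, 224, 224}) == std::vector<int32_t>{1, 3, 224, 224}));
}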
+ */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_slice : op_parser +{ + std::vector operators() const { return {{"Slice"}}; } + + // Use a literal instruction to replace the shape since output of + // shape operator are literals in migraphx + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + const tf_parser::node_info& info, + std::vector args) const + { + auto starts = args[1]->eval().get().to_vector(); + auto size = args[2]->eval().get().to_vector(); + auto axes = args[0]->get_shape().lens(); + size_t num_axes = axes.size(); + + std::vector axes_int64(axes.begin(), axes.end()); + std::vector starts_int64(starts.begin(), starts.end()); + std::vector ends(num_axes); + std::vector op_axes(num_axes); + std::iota(op_axes.begin(), op_axes.end(), 0); + for(size_t i = 0; i < num_axes; i++) + { + if(size[i] == -1) + ends[i] = axes_int64[i]; + else + ends[i] = starts_int64[i] + size[i]; + } + auto op = make_op("slice", {{"starts", starts_int64}, {"ends", ends}, {"axes", op_axes}}); + return info.add_instruction(op, info.make_contiguous(args[0])); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_softmax.cpp b/docker/rocm/migraphx/tf/parse_softmax.cpp new file mode 100644 index 000000000..a136e5c6f --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_softmax.cpp @@ -0,0 +1,60 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
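// parse_slice above converts TF's (begin, size) pair into slice starts/ends;
// a size of -1 means "everything up to the end of that dimension". A
// standalone sketch of the end-point computation; names are illustrative only.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> slice_ends(const std::vector<int64_t>& starts,
                                const std::vector<int64_t>& sizes,
                                const std::vector<int64_t>& dims)
{
    std::vector<int64_t> ends(starts.size());
    for(std::size_t i = 0; i < starts.size(); i++)
        ends[i] = (sizes[i] == -1) ? dims[i] : starts[i] + sizes[i];
    return ends;
}

int main()
{
    // dims {4, 6}, begin {1, 2}, size {2, -1} -> ends {3, 6}
    assert((slice_ends({1, 2}, {2, -1}, {4, 6}) == std::vector<int64_t>{3, 6}));
}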
+ */ +#include +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_softmax : op_parser +{ + std::vector operators() const { return {{"Softmax"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + tf_parser::node_info info, + std::vector args) const + { + int axis = -1; + auto num_dims = args[0]->get_shape().lens().size(); + if(contains(info.attributes, "axis")) + { + axis = static_cast(info.attributes.at("axis").i()); + } + + axis = tune_axis(num_dims, axis, "tf_parse_softmax"); + + return info.add_instruction(make_op("softmax", {{"axis", axis}}), + info.make_contiguous(args[0])); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_split.cpp b/docker/rocm/migraphx/tf/parse_split.cpp new file mode 100644 index 000000000..91aa23936 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_split.cpp @@ -0,0 +1,121 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
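// tune_axis is not shown in this patch; the usual behaviour it provides is
// normalizing a negative axis against the rank, so Softmax's default axis of
// -1 resolves to the last dimension. A sketch under that assumption; the
// function name is illustrative only.
#include <cassert>

int normalize_axis(int axis, int rank) { return axis < 0 ? axis + rank : axis; }

int main()
{
    assert(normalize_axis(-1, 4) == 3); // default Softmax axis -> last dim
    assert(normalize_axis(2, 4) == 2);  // non-negative axes pass through
}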
+ */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_split : op_parser +{ + std::vector operators() const { return {{"Split"}, {"SplitV"}}; } + + std::vector parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + tf_parser::node_info info, + std::vector args) const + { + bool vector_as_input = args.size() == 3; + int num_outputs = 1; + auto axis_arg = args[0]; + auto input_arg = args[1]; + if(vector_as_input) + { + input_arg = args[0]; + axis_arg = args[2]; + } + + if(contains(info.attributes, "num_split")) + num_outputs = info.attributes.at("num_split").i(); + + std::vector splits(num_outputs); + std::vector slice_pos{0}; + if(vector_as_input) + { + splits = args[1]->eval().get().to_vector(); + num_outputs = splits.size(); + } + + assert(num_outputs > 0); + + if(num_outputs == 1) + return std::vector{ + info.add_instruction(make_op("identity"), input_arg)}; + + auto lens = input_arg->get_shape().lens(); + auto num_dims = lens.size(); + int axis = axis_arg->eval().at(); + + // ensure split is made evenly if "num_split" is used + assert(vector_as_input or lens[axis] % num_outputs == 0); + + auto split_size = lens[axis] / num_outputs; + + // push back first end point of slice + if(vector_as_input) + { + slice_pos.push_back(splits[0]); + } + else + { + slice_pos.push_back(split_size); + } + + // calculate remaining end points for each slice + for(auto i = 1; i < num_outputs; i++) + { + if(vector_as_input) + { + splits[i] += splits[i - 1]; + slice_pos.push_back(splits[i]); + } + else + { + slice_pos.push_back((i + 1) * split_size); + } + } + std::vector result; + for(auto i = 0; i < num_outputs; i++) + { + std::vector axes(num_dims); + std::iota(axes.begin(), axes.end(), 0); + std::vector starts(num_dims, 0); + std::vector ends(lens.begin(), lens.end()); + + starts[axis] = slice_pos[i]; + ends[axis] = slice_pos[i + 1]; + auto op = make_op("slice", {{"axes", axes}, {"starts", starts}, {"ends", ends}}); + result.push_back(info.add_instruction(op, input_arg)); + } + return result; + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_squeeze.cpp b/docker/rocm/migraphx/tf/parse_squeeze.cpp new file mode 100644 index 000000000..8936c0ac3 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_squeeze.cpp @@ -0,0 +1,64 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
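// parse_split above turns per-output sizes (SplitV) or an even num_split into
// cumulative slice positions along the split axis; adjacent positions become
// the starts/ends of each output slice. A standalone sketch of that prefix
// sum; names are illustrative only.
#include <cassert>
#include <cstdint>
#include <iterator>
#include <numeric>
#include <vector>

std::vector<int64_t> slice_positions(const std::vector<int64_t>& split_sizes)
{
    std::vector<int64_t> pos{0};
    std::partial_sum(split_sizes.begin(), split_sizes.end(), std::back_inserter(pos));
    return pos;
}

int main()
{
    // SplitV sizes {2, 3, 5} -> positions {0, 2, 5, 10}
    assert((slice_positions({2, 3, 5}) == std::vector<int64_t>{0, 2, 5, 10}));
    // an even split of a length-9 axis into 3 -> {0, 3, 6, 9}
    assert((slice_positions({3, 3, 3}) == std::vector<int64_t>{0, 3, 6, 9}));
}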
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_squeeze : op_parser +{ + std::vector operators() const { return {{"Squeeze"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + tf_parser::node_info info, + std::vector args) const + { + auto input_dims = args[0]->get_shape().lens(); + auto axes = info.attributes.at("squeeze_dims").list().i(); + std::vector op_axes(axes.begin(), axes.end()); + + if(op_axes.empty()) // no squeeze_dims provided, remove any dim that equals 1 + { + for(size_t i = 0; i < input_dims.size(); i++) + { + if(input_dims.at(i) == 1) + { + op_axes.push_back(i); + } + } + } + return info.add_instruction(make_op("squeeze", {{"axes", op_axes}}), + info.make_contiguous(args[0])); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_stridedslice.cpp b/docker/rocm/migraphx/tf/parse_stridedslice.cpp new file mode 100644 index 000000000..c161e8953 --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_stridedslice.cpp @@ -0,0 +1,101 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
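// When Squeeze carries no squeeze_dims, the parser above removes every
// dimension of size 1. A standalone sketch of that axis selection; names are
// illustrative only.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> default_squeeze_axes(const std::vector<std::size_t>& dims)
{
    std::vector<int64_t> axes;
    for(std::size_t i = 0; i < dims.size(); i++)
        if(dims[i] == 1)
            axes.push_back(static_cast<int64_t>(i));
    return axes;
}

int main()
{
    assert((default_squeeze_axes({1, 3, 1, 5}) == std::vector<int64_t>{0, 2}));
}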
+ */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_strideslice : op_parser +{ + std::vector operators() const { return {{"StridedSlice"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + tf_parser::node_info info, + std::vector args) const + { + auto starts = args[1]->eval().get().to_vector(); + auto ends = args[2]->eval().get().to_vector(); + auto l0 = args[0]; + size_t num_axes = l0->get_shape().lens().size(); + std::vector axes = l0->get_shape().lens(); + + std::vector op_starts(starts.begin(), starts.end()); + std::vector op_ends(ends.begin(), ends.end()); + std::vector op_axes(num_axes); + std::iota(op_axes.begin(), op_axes.end(), 0); + uint32_t begin_mask = 0; + uint32_t end_mask = 0; + uint32_t shrink_axis_mask = 0; + uint32_t bitwise_compare = 1; + std::vector squeeze_axes; + + if(contains(info.attributes, "begin_mask")) + begin_mask = static_cast(info.attributes.at("begin_mask").i()); + + if(contains(info.attributes, "end_mask")) + end_mask = static_cast(info.attributes.at("end_mask").i()); + + if(contains(info.attributes, "shrink_axis_mask")) + shrink_axis_mask = static_cast(info.attributes.at("shrink_axis_mask").i()); + + std::vector begin_axes = get_axes_from_mask(num_axes, begin_mask); + std::vector end_axes = get_axes_from_mask(num_axes, end_mask); + + for(size_t i = 0; i < num_axes; i++) + { + if(begin_axes.at(i) == 1) + { + op_starts.at(i) = 0; + } + if(end_axes.at(i) == 1) + { + op_ends.at(i) = axes.at(i); + } + } + + auto op = make_op("slice", {{"starts", op_starts}, {"ends", op_ends}, {"axes", op_axes}}); + auto l1 = info.add_instruction(op, l0); + if(shrink_axis_mask == 0) + return l1; + + for(size_t i = 0; i < num_axes; i++) + { + // the LSB corresponds to axis 0 when determining which axes to squeeze + if(((shrink_axis_mask >> i) & bitwise_compare) == 1) + squeeze_axes.push_back(i); + } + + return info.add_instruction(make_op("squeeze", {{"axes", squeeze_axes}}), l1); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/parse_transpose.cpp b/docker/rocm/migraphx/tf/parse_transpose.cpp new file mode 100644 index 000000000..9b306b97d --- /dev/null +++ b/docker/rocm/migraphx/tf/parse_transpose.cpp @@ -0,0 +1,52 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
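// StridedSlice above reads begin_mask, end_mask and shrink_axis_mask bit by
// bit, with the least significant bit corresponding to axis 0: a set bit in
// begin_mask resets that axis's start to 0, a set bit in end_mask resets its
// end to the full dimension, and a set bit in shrink_axis_mask squeezes that
// axis afterwards. A standalone sketch of the bit extraction; the name is
// illustrative only.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int> axes_from_mask(std::size_t num_axes, uint32_t mask)
{
    std::vector<int> flags;
    for(std::size_t i = 0; i < num_axes; i++)
        flags.push_back(static_cast<int>((mask >> i) & 1u));
    return flags;
}

int main()
{
    // mask 0b0101 over 4 axes flags axes 0 and 2
    assert((axes_from_mask(4, 5) == std::vector<int>{1, 0, 1, 0}));
}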
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +struct parse_transpose : op_parser +{ + std::vector operators() const { return {{"Transpose"}}; } + + instruction_ref parse(const op_desc& /*opd*/, + const tf_parser& /*parser*/, + const tf_parser::node_info& info, + std::vector args) const + { + auto perm = args[1]->eval().get().to_vector(); + std::vector dims(perm.begin(), perm.end()); + + return info.add_instruction(make_op("transpose", {{"permutation", dims}}), args.front()); + } +}; + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/resource_handle.proto b/docker/rocm/migraphx/tf/resource_handle.proto new file mode 100644 index 000000000..a54d3d906 --- /dev/null +++ b/docker/rocm/migraphx/tf/resource_handle.proto @@ -0,0 +1,30 @@ +syntax = "proto3"; + +package tensorflow; +option cc_enable_arenas = true; +option java_outer_classname = "ResourceHandle"; +option java_multiple_files = true; +option java_package = "org.tensorflow.framework"; +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; + +// Protocol buffer representing a handle to a tensorflow resource. Handles are +// not valid across executions, but can be serialized back and forth from within +// a single run. +message ResourceHandleProto { + // Unique name for the device containing the resource. + string device = 1; + + // Container in which this resource is placed. + string container = 2; + + // Unique name of this resource. + string name = 3; + + // Hash code for the type of the resource. Is only valid in the same device + // and in the same execution. + uint64 hash_code = 4; + + // For debug-only, the name of the type pointed to by this handle, if + // available. + string maybe_type_name = 5; +}; diff --git a/docker/rocm/migraphx/tf/tensor.proto b/docker/rocm/migraphx/tf/tensor.proto new file mode 100644 index 000000000..5d4d66aed --- /dev/null +++ b/docker/rocm/migraphx/tf/tensor.proto @@ -0,0 +1,94 @@ +syntax = "proto3"; + +package tensorflow; +option cc_enable_arenas = true; +option java_outer_classname = "TensorProtos"; +option java_multiple_files = true; +option java_package = "org.tensorflow.framework"; +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; +import "resource_handle.proto"; +import "tensor_shape.proto"; +import "types.proto"; + +// Protocol buffer representing a tensor. +message TensorProto { + DataType dtype = 1; + + // Shape of the tensor. TODO(touts): sort out the 0-rank issues. + TensorShapeProto tensor_shape = 2; + + // Only one of the representations below is set, one of "tensor_contents" and + // the "xxx_val" attributes. We are not using oneof because as oneofs cannot + // contain repeated fields it would require another extra set of messages. + + // Version number. + // + // In version 0, if the "repeated xxx" representations contain only one + // element, that element is repeated to fill the shape. This makes it easy + // to represent a constant Tensor with a single value. 
+ int32 version_number = 3; + + // Serialized raw tensor content from either Tensor::AsProtoTensorContent or + // memcpy in tensorflow::grpc::EncodeTensorToByteBuffer. This representation + // can be used for all tensor types. The purpose of this representation is to + // reduce serialization overhead during RPC call by avoiding serialization of + // many repeated small items. + bytes tensor_content = 4; + + // Type specific representations that make it easy to create tensor protos in + // all languages. Only the representation corresponding to "dtype" can + // be set. The values hold the flattened representation of the tensor in + // row major order. + + // DT_HALF, DT_BFLOAT16. Note that since protobuf has no int16 type, we'll + // have some pointless zero padding for each value here. + repeated int32 half_val = 13 [packed = true]; + + // DT_FLOAT. + repeated float float_val = 5 [packed = true]; + + // DT_DOUBLE. + repeated double double_val = 6 [packed = true]; + + // DT_INT32, DT_INT16, DT_INT8, DT_UINT8. + repeated int32 int_val = 7 [packed = true]; + + // DT_STRING + repeated bytes string_val = 8; + + // DT_COMPLEX64. scomplex_val(2*i) and scomplex_val(2*i+1) are real + // and imaginary parts of i-th single precision complex. + repeated float scomplex_val = 9 [packed = true]; + + // DT_INT64 + repeated int64 int64_val = 10 [packed = true]; + + // DT_BOOL + repeated bool bool_val = 11 [packed = true]; + + // DT_COMPLEX128. dcomplex_val(2*i) and dcomplex_val(2*i+1) are real + // and imaginary parts of i-th double precision complex. + repeated double dcomplex_val = 12 [packed = true]; + + // DT_RESOURCE + repeated ResourceHandleProto resource_handle_val = 14; + + // DT_VARIANT + repeated VariantTensorDataProto variant_val = 15; + + // DT_UINT32 + repeated uint32 uint32_val = 16 [packed = true]; + + // DT_UINT64 + repeated uint64 uint64_val = 17 [packed = true]; +}; + +// Protocol buffer representing the serialization format of DT_VARIANT tensors. +message VariantTensorDataProto { + // Name of the type of objects being serialized. + string type_name = 1; + // Portions of the object that are not Tensors. + bytes metadata = 2; + // Tensors contained within objects being serialized. + repeated TensorProto tensors = 3; +} diff --git a/docker/rocm/migraphx/tf/tensor_shape.proto b/docker/rocm/migraphx/tf/tensor_shape.proto new file mode 100644 index 000000000..286156a01 --- /dev/null +++ b/docker/rocm/migraphx/tf/tensor_shape.proto @@ -0,0 +1,46 @@ +// Protocol buffer representing the shape of tensors. + +syntax = "proto3"; +option cc_enable_arenas = true; +option java_outer_classname = "TensorShapeProtos"; +option java_multiple_files = true; +option java_package = "org.tensorflow.framework"; +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; + +package tensorflow; + +// Dimensions of a tensor. +message TensorShapeProto { + // One dimension of the tensor. + message Dim { + // Size of the tensor in that dimension. + // This value must be >= -1, but values of -1 are reserved for "unknown" + // shapes (values of -1 mean "unknown" dimension). Certain wrappers + // that work with TensorShapeProto may fail at runtime when deserializing + // a TensorShapeProto containing a dim value of -1. + int64 size = 1; + + // Optional name of the tensor dimension. + string name = 2; + }; + + // Dimensions of the tensor, such as {"input", 30}, {"output", 40} + // for a 30 x 40 2D tensor. If an entry has size -1, this + // corresponds to a dimension of unknown size. 
The names are + // optional. + // + // The order of entries in "dim" matters: It indicates the layout of the + // values in the tensor in-memory representation. + // + // The first entry in "dim" is the outermost dimension used to layout the + // values, the last entry is the innermost dimension. This matches the + // in-memory layout of RowMajor Eigen tensors. + // + // If "dim.size()" > 0, "unknown_rank" must be false. + repeated Dim dim = 2; + + // If true, the number of dimensions in the shape is unknown. + // + // If true, "dim.size()" must be 0. + bool unknown_rank = 3; +}; diff --git a/docker/rocm/migraphx/tf/tf.cpp b/docker/rocm/migraphx/tf/tf.cpp new file mode 100644 index 000000000..7b6c1322d --- /dev/null +++ b/docker/rocm/migraphx/tf/tf.cpp @@ -0,0 +1,85 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +template +program parse_tf_from(const tf_options& options, Ts&&... xs) +{ + tf::tf_parser parser; + parser.is_nhwc = options.is_nhwc; + parser.batch_size = options.batch_size; + parser.map_input_dims = options.map_input_dims; + parser.output_node_names = options.output_node_names; + +#ifndef NDEBUG + // Log the program when it can't be parsed + try + { + parser.parse_from(std::forward(xs)...); + } + catch(...) 
+ { + std::cerr << parser.prog << std::endl; + throw; + } +#else + parser.parse_from(std::forward(xs)...); +#endif + return std::move(parser.prog); +} + +program parse_tf(const std::string& name, const tf_options& options) +{ + std::fstream input(name.c_str(), std::ios::in | std::ios::binary); + return parse_tf_from(options, input); +} + +program parse_tf_buffer(const std::string& buffer, const tf_options& options) +{ + return parse_tf_from(options, buffer.data(), buffer.size()); +} + +program parse_tf_buffer(const void* data, std::size_t size, const tf_options& options) +{ + return parse_tf_from(options, data, size); +} + +std::vector get_tf_operators() { return tf::get_op_parsers(); } + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/tf_parser.cpp b/docker/rocm/migraphx/tf/tf_parser.cpp new file mode 100644 index 000000000..a53c7b29e --- /dev/null +++ b/docker/rocm/migraphx/tf/tf_parser.cpp @@ -0,0 +1,605 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
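// A minimal caller sketch for the entry points defined in tf.cpp above,
// assuming they are exposed through the public migraphx/tf.hpp header; the
// model path is a placeholder and error handling is omitted.
#include <iostream>
#include <migraphx/tf.hpp>

int main()
{
    migraphx::tf_options options;
    options.is_nhwc    = true; // TF graphs are NHWC by default
    options.batch_size = 1;    // substituted for non-positive (dynamic) batch dims
    auto prog = migraphx::parse_tf("model.pb", options);
    std::cout << prog << std::endl;
}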
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace tf { + +bool tf_parser::should_transpose(instruction_ref ins) const +{ + return is_nhwc and ins->get_shape().lens().size() == 4; +} + +instruction_ref tf_parser::to_nhwc(instruction_ref ins) const +{ + if(should_transpose(ins)) + return mm->add_instruction(make_op("transpose", {{"permutation", {0, 2, 3, 1}}}), ins); + return ins; +} + +instruction_ref tf_parser::to_nchw(instruction_ref ins) const +{ + if(should_transpose(ins)) + return mm->add_instruction(make_op("transpose", {{"permutation", {0, 3, 1, 2}}}), ins); + return ins; +} + +instruction_ref tf_parser::to_kcxy(instruction_ref ins) const +{ + return mm->add_instruction(make_op("transpose", {{"permutation", {3, 2, 0, 1}}}), ins); +} + +std::vector tf_parser::to_nchw(const std::vector& args) const +{ + std::vector result(args.size()); + std::transform( + args.begin(), args.end(), result.begin(), [&](auto ins) { return this->to_nchw(ins); }); + return result; +} + +std::vector tf_parser::to_nhwc(const std::vector& args) const +{ + std::vector result(args.size()); + std::transform( + args.begin(), args.end(), result.begin(), [&](auto ins) { return this->to_nhwc(ins); }); + return result; +} + +instruction_ref tf_parser::node_info::make_contiguous(instruction_ref ins) const +{ + if(ins->get_shape().standard()) + return ins; + else + return mm->add_instruction(make_op("contiguous"), ins); +} + +instruction_ref tf_parser::node_info::add_broadcastable_binary_op(const std::string& op_name, + instruction_ref arg0, + instruction_ref arg1) const +{ + return this->add_common_op(op_name, arg0, arg1); +} + +instruction_ref tf_parser::node_info::add_common_op(const std::string& op_name, + std::vector inputs) const +{ + return migraphx::add_common_op(*mm, make_op(op_name), std::move(inputs)); +} + +int64_t tf_parser::parse_axis(const int64_t dim, const size_t num_dims) const +{ + int64_t new_dim = dim; + if(is_nhwc and num_dims >= 4) + { + switch(dim) + { + case 0: new_dim = 0; break; + case 1: new_dim = 2; break; + case 2: new_dim = 3; break; + case 3: new_dim = 1; break; + default: break; + } + } + return new_dim; +} + +instruction_ref +tf_parser::node_info::add_instruction(const operation& op, + const std::vector& args) const +{ + return mm->add_instruction(op, args); +} + +instruction_ref tf_parser::node_info::add_literal(literal l) const +{ + return mm->add_literal(std::move(l)); +} + +std::vector get_axes_from_mask(const size_t num_axes, const uint32_t mask) +{ + uint32_t bitwise_compare = 1; + std::vector axes; + for(size_t i = 0; i < num_axes; i++) + { + // the LSB corresponds to axis 0 when determining which axes to begin + if(((mask >> i) & bitwise_compare) == 1) + axes.push_back(1); + else + axes.push_back(0); + } + return axes; +} + +tf_parser::tf_parser() +{ + // Add all registered op parsers + for(auto&& name : get_op_parsers()) + ops.emplace(name, get_op_parser(name)); +} + +static std::string get_name(const tensorflow::NodeDef& node) { return node.name(); } + +static tf_parser::node_map get_nodes(const tensorflow::GraphDef& graph, + std::vector& input_nodes) +{ + tf_parser::node_map result; + for(auto&& node : graph.node()) + { + auto node_name = get_name(node); + // assume each node in graph has an associated name + 
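// to_nhwc / to_nchw above use the permutations {0, 2, 3, 1} and {0, 3, 1, 2};
// applying one after the other restores the original layout, and parse_axis
// maps an NHWC axis index to its NCHW position (N->0, H->2, W->3, C->1). A
// standalone check of the permutation pair; names are illustrative only.
#include <cassert>
#include <cstddef>
#include <vector>

// output dim i takes input dim perm[i], matching the transpose operator
std::vector<int> apply_perm(const std::vector<int>& dims, const std::vector<int>& perm)
{
    std::vector<int> out(dims.size());
    for(std::size_t i = 0; i < perm.size(); i++)
        out[i] = dims[static_cast<std::size_t>(perm[i])];
    return out;
}

int main()
{
    std::vector<int> nchw{1, 16, 7, 9};         // N, C, H, W
    auto nhwc = apply_perm(nchw, {0, 2, 3, 1}); // -> {1, 7, 9, 16}
    assert((nhwc == std::vector<int>{1, 7, 9, 16}));
    assert(apply_perm(nhwc, {0, 3, 1, 2}) == nchw); // round-trips to NCHW
}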
if(node_name.empty()) + MIGRAPHX_THROW("tf node with no name found"); + result[node_name] = node; + if(node.op() == "Placeholder") + { + input_nodes.push_back(node); + } + } + return result; +} + +static tf_parser::attribute_map get_attributes(const tensorflow::NodeDef& node) +{ + tf_parser::attribute_map result; + for(auto&& attr : node.attr()) + { + result[attr.first] = attr.second; + } + + return result; +} + +static std::vector parse_dims(const tensorflow::TensorShapeProto& s) +{ + std::vector dims; + auto input_dims = s.dim(); + std::transform(input_dims.begin(), + input_dims.end(), + std::back_inserter(dims), + [](const tensorflow::TensorShapeProto_Dim& dim) { return dim.size(); }); + return dims; +} + +template +static std::vector get_data_vals(const google::protobuf::RepeatedField& data, + const size_t& shape_size) +{ + std::vector data_vals(shape_size); + // check if shape has enough data values given existing fields + if(data.size() == 1) + { + std::fill(data_vals.begin(), data_vals.end(), data[0]); + } + else + copy(data.begin(), data.end(), data_vals.begin()); + return data_vals; +} + +template +static literal +create_literal(shape::type_t shape_type, const std::vector& dims, std::vector data) +{ + // assume if explicit value is mentioned in protobuf and dim size <= 1, treat as scalar + if(dims.empty() or (dims.size() == 1 and dims.front() == 1)) + return literal{{shape_type}, data}; + return literal{{shape_type, dims}, data}; +} + +static bool is_valid_op(const tensorflow::NodeDef& node) +{ + std::vector ignored{"NoOp", "Assert"}; + return none_of(ignored, [&](const auto& op) { + const auto& name = get_name(node); + return node.op() == op or contains(name, op); + }); +} + +std::vector tf_parser::find_outputs() const +{ + std::unordered_set inputs; + for(auto&& p : nodes) + { + auto&& node = p.second; + std::copy(node.input().begin(), node.input().end(), std::inserter(inputs, inputs.end())); + } + std::vector outputs; + for(auto&& p : nodes) + { + const auto& name = p.first; + const auto& node = p.second; + if(not is_valid_op(node)) + continue; + // control flow related, ignore this node + if(contains(name, "^")) + continue; + // literals are valid ops, but they are not outputs unless specified + if(node.op() == "Const") + continue; + if(inputs.count(name) == 0) + outputs.push_back(name); + } + return outputs; +} + +void tf_parser::parse_graph(const tensorflow::GraphDef& graph) +{ + nodes = get_nodes(graph, input_nodes); + for(auto&& input : input_nodes) + { + const std::string& name = input.name(); + attribute_map input_attrs = get_attributes(input); + shape::type_t shape_type = parse_type(input_attrs.at("dtype").type()); + std::vector dims = parse_dims(input_attrs.at("shape").shape()); + + if(contains(map_input_dims, name)) + { + dims = map_input_dims.at(name); + } + else + { + if(is_nhwc and dims.size() >= 4) + { + this->reorder_data(dims); + } + std::transform(dims.begin(), dims.end(), dims.begin(), [&](auto dim) { + return static_cast(dim) <= 0 ? 
batch_size : dim; + }); + } + + shape s = shape{shape_type, dims}; + instructions[name] = to_nhwc(mm->add_parameter(name, s)); + } + for(auto&& p : nodes) + { + this->parse_node(p.first); + } + if(mm->size() == 0) + return; + + // Needs to add a ret instruction at the end of + // the program + if(output_node_names.empty()) + { + output_node_names = find_outputs(); + } + + std::vector output_ins; + std::transform(output_node_names.begin(), + output_node_names.end(), + std::back_inserter(output_ins), + [&](auto output_name) { + if(not contains(instructions, output_name)) + MIGRAPHX_THROW("PARSE_TF: output name " + output_name + + " not found in graph!"); + return this->to_nchw(instructions[output_name]); + }); + mm->add_return(output_ins); +} + +void tf_parser::parse_node(const std::string& name) +{ + if(instructions.count(name) == 0) + { + auto&& node = nodes.at(name); + if(not is_valid_op(node)) + return; + std::vector args; + for(auto&& input : node.input()) + { + // control dependencies (signified by ^ before the name) are ignored + if(contains(input, "^")) + continue; + std::string input_name = input; + // if input has trailing `:0` index then remove it + auto multi_out_idx = input.find(':'); + if(multi_out_idx != std::string::npos and input.substr(multi_out_idx + 1) == "0") + { + input_name = input.substr(0, multi_out_idx); + } + if(nodes.count(input_name) > 0) + { + // input was from a node with multiple outputs + if(contains(input_name, ':')) + { + input_name.resize(input.find(':')); + } + else + { + input_name = get_name(nodes.at(input_name)); + } + assert(name != input_name); + this->parse_node(input_name); + args.push_back(instructions.at(input_name)); + } + else + { + args.push_back(instructions.at(input_name)); + } + } + std::vector result; + if(ops.count(node.op()) == 0) + { + result.push_back(mm->add_instruction(op::unknown{node.op()}, args)); + } + else + { + result = ops[node.op()](*this, {get_attributes(node), node.op(), mm}, args); + } + assert(not result.empty()); + // First output has no ":" delimiter + instructions[name] = result.front(); + for(size_t i = 1; i < result.size(); i++) + { + instructions[name + ":" + std::to_string(i)] = result.at(i); + } + } +} + +void tf_parser::parse_from(std::istream& is) +{ + tensorflow::GraphDef graph; + if(graph.ParseFromIstream(&is)) + { + this->parse_graph(graph); + } + else + { + throw std::runtime_error("Failed reading tf file"); + } +} + +void tf_parser::parse_from(const void* data, std::size_t size) +{ + tensorflow::GraphDef graph; + if(graph.ParseFromArray(data, size)) + { + this->parse_graph(graph); + } + else + { + throw std::runtime_error("Failed reading tf buffer array"); + } +} + +shape::type_t tf_parser::parse_type(const tensorflow::DataType t) const +{ + shape::type_t shape_type{}; + switch(t) + { + case tensorflow::DataType::DT_FLOAT: shape_type = shape::float_type; break; + case tensorflow::DataType::DT_DOUBLE: shape_type = shape::double_type; break; + case tensorflow::DataType::DT_INT32: shape_type = shape::int32_type; break; + case tensorflow::DataType::DT_INT16: shape_type = shape::int16_type; break; + case tensorflow::DataType::DT_INT8: shape_type = shape::int8_type; break; + case tensorflow::DataType::DT_INT64: shape_type = shape::int64_type; break; + case tensorflow::DataType::DT_UINT16: shape_type = shape::uint16_type; break; + case tensorflow::DataType::DT_HALF: shape_type = shape::half_type; break; + case tensorflow::DataType::DT_UINT32: shape_type = shape::uint32_type; break; + case 
tensorflow::DataType::DT_UINT64: shape_type = shape::uint64_type; break; + + case tensorflow::DataType::DT_INVALID: + case tensorflow::DataType::DT_UINT8: + case tensorflow::DataType::DT_STRING: + case tensorflow::DataType::DT_COMPLEX64: + case tensorflow::DataType::DT_BOOL: + case tensorflow::DataType::DT_QINT8: + case tensorflow::DataType::DT_QUINT8: + case tensorflow::DataType::DT_QINT32: + case tensorflow::DataType::DT_BFLOAT16: + case tensorflow::DataType::DT_QINT16: + case tensorflow::DataType::DT_QUINT16: + case tensorflow::DataType::DT_COMPLEX128: + case tensorflow::DataType::DT_RESOURCE: + case tensorflow::DataType::DT_VARIANT: + // tf pb should not use these types + case tensorflow::DataType::DT_FLOAT_REF: + case tensorflow::DataType::DT_DOUBLE_REF: + case tensorflow::DataType::DT_INT32_REF: + case tensorflow::DataType::DT_UINT8_REF: + case tensorflow::DataType::DT_INT16_REF: + case tensorflow::DataType::DT_INT8_REF: + case tensorflow::DataType::DT_STRING_REF: + case tensorflow::DataType::DT_COMPLEX64_REF: + case tensorflow::DataType::DT_INT64_REF: + case tensorflow::DataType::DT_BOOL_REF: + case tensorflow::DataType::DT_QINT8_REF: + case tensorflow::DataType::DT_QUINT8_REF: + case tensorflow::DataType::DT_QINT32_REF: + case tensorflow::DataType::DT_BFLOAT16_REF: + case tensorflow::DataType::DT_QINT16_REF: + case tensorflow::DataType::DT_QUINT16_REF: + case tensorflow::DataType::DT_UINT16_REF: + case tensorflow::DataType::DT_COMPLEX128_REF: + case tensorflow::DataType::DT_HALF_REF: + case tensorflow::DataType::DT_RESOURCE_REF: + case tensorflow::DataType::DT_VARIANT_REF: + case tensorflow::DataType::DT_UINT32_REF: + case tensorflow::DataType::DT_UINT64_REF: + case tensorflow::DataType::DataType_INT_MAX_SENTINEL_DO_NOT_USE_: + case tensorflow::DataType::DataType_INT_MIN_SENTINEL_DO_NOT_USE_: break; + } + return shape_type; +} + +literal tf_parser::parse_tensor(const tensorflow::TensorProto& t) const +{ + std::vector dims = parse_dims(t.tensor_shape()); + size_t shape_size = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies()); + if(not t.tensor_content().empty()) // has raw data + { + const std::string& s = t.tensor_content(); + switch(t.dtype()) + { + case tensorflow::DataType::DT_FLOAT: return literal{{shape::float_type, dims}, s.data()}; + case tensorflow::DataType::DT_BOOL: + case tensorflow::DataType::DT_INT8: return literal{{shape::int8_type, dims}, s.data()}; + case tensorflow::DataType::DT_UINT16: + case tensorflow::DataType::DT_INT16: return literal{{shape::int16_type, dims}, s.data()}; + case tensorflow::DataType::DT_INT32: return literal{{shape::int32_type, dims}, s.data()}; + case tensorflow::DataType::DT_INT64: return literal{{shape::int64_type, dims}, s.data()}; + case tensorflow::DataType::DT_HALF: return literal{{shape::half_type, dims}, s.data()}; + case tensorflow::DataType::DT_DOUBLE: return literal{{shape::double_type, dims}, s.data()}; + case tensorflow::DataType::DT_INVALID: + case tensorflow::DataType::DT_UINT8: + case tensorflow::DataType::DT_STRING: + case tensorflow::DataType::DT_UINT32: + case tensorflow::DataType::DT_UINT64: + case tensorflow::DataType::DT_COMPLEX64: + case tensorflow::DataType::DT_COMPLEX128: + case tensorflow::DataType::DT_QINT8: + case tensorflow::DataType::DT_QUINT8: + case tensorflow::DataType::DT_QINT32: + case tensorflow::DataType::DT_BFLOAT16: + case tensorflow::DataType::DT_QINT16: + case tensorflow::DataType::DT_QUINT16: + case tensorflow::DataType::DT_RESOURCE: + case tensorflow::DataType::DT_VARIANT: + case 
tensorflow::DataType::DT_FLOAT_REF: + case tensorflow::DataType::DT_DOUBLE_REF: + case tensorflow::DataType::DT_INT32_REF: + case tensorflow::DataType::DT_UINT8_REF: + case tensorflow::DataType::DT_INT16_REF: + case tensorflow::DataType::DT_INT8_REF: + case tensorflow::DataType::DT_STRING_REF: + case tensorflow::DataType::DT_COMPLEX64_REF: + case tensorflow::DataType::DT_INT64_REF: + case tensorflow::DataType::DT_BOOL_REF: + case tensorflow::DataType::DT_QINT8_REF: + case tensorflow::DataType::DT_QUINT8_REF: + case tensorflow::DataType::DT_QINT32_REF: + case tensorflow::DataType::DT_BFLOAT16_REF: + case tensorflow::DataType::DT_QINT16_REF: + case tensorflow::DataType::DT_QUINT16_REF: + case tensorflow::DataType::DT_UINT16_REF: + case tensorflow::DataType::DT_COMPLEX128_REF: + case tensorflow::DataType::DT_HALF_REF: + case tensorflow::DataType::DT_RESOURCE_REF: + case tensorflow::DataType::DT_VARIANT_REF: + case tensorflow::DataType::DT_UINT32_REF: + case tensorflow::DataType::DT_UINT64_REF: + case tensorflow::DataType::DataType_INT_MAX_SENTINEL_DO_NOT_USE_: + case tensorflow::DataType::DataType_INT_MIN_SENTINEL_DO_NOT_USE_: + throw std::runtime_error(""); + } + MIGRAPHX_THROW("Invalid tensor type"); + } + switch(t.dtype()) + { + case tensorflow::DataType::DT_FLOAT: + return create_literal(shape::float_type, dims, get_data_vals(t.float_val(), shape_size)); + case tensorflow::DataType::DT_INT8: + return create_literal(shape::int8_type, dims, get_data_vals(t.int_val(), shape_size)); + case tensorflow::DataType::DT_UINT16: + return create_literal(shape::uint16_type, dims, get_data_vals(t.int_val(), shape_size)); + case tensorflow::DataType::DT_INT16: + return create_literal(shape::int16_type, dims, get_data_vals(t.int_val(), shape_size)); + case tensorflow::DataType::DT_INT32: + return create_literal(shape::int32_type, dims, get_data_vals(t.int_val(), shape_size)); + case tensorflow::DataType::DT_INT64: + return create_literal(shape::int64_type, dims, get_data_vals(t.int64_val(), shape_size)); + case tensorflow::DataType::DT_BOOL: + return create_literal(shape::int32_type, dims, get_data_vals(t.bool_val(), shape_size)); + case tensorflow::DataType::DT_HALF: { + // each half_val entry carries an fp16 bit pattern in the low 16 bits of an int32 + std::vector<int> data_int32 = get_data_vals(t.half_val(), shape_size); + std::vector<uint16_t> data_uint16(data_int32.begin(), data_int32.end()); + std::vector<half> data_half; + std::transform(data_uint16.begin(), + data_uint16.end(), + std::back_inserter(data_half), + [](uint16_t raw_val) { return *reinterpret_cast<half*>(&raw_val); }); + return create_literal(shape::half_type, dims, data_half); + } + case tensorflow::DataType::DT_DOUBLE: + return literal{{shape::double_type, dims}, get_data_vals(t.double_val(), shape_size)}; + case tensorflow::DataType::DT_INVALID: + case tensorflow::DataType::DT_UINT8: + case tensorflow::DataType::DT_STRING: + case tensorflow::DataType::DT_UINT32: + case tensorflow::DataType::DT_UINT64: + case tensorflow::DataType::DT_COMPLEX64: + case tensorflow::DataType::DT_COMPLEX128: + case tensorflow::DataType::DT_QINT8: + case tensorflow::DataType::DT_QUINT8: + case tensorflow::DataType::DT_QINT32: + case tensorflow::DataType::DT_BFLOAT16: + case tensorflow::DataType::DT_QINT16: + case tensorflow::DataType::DT_QUINT16: + case tensorflow::DataType::DT_RESOURCE: + case tensorflow::DataType::DT_VARIANT: + case tensorflow::DataType::DT_FLOAT_REF: + case tensorflow::DataType::DT_DOUBLE_REF: + case tensorflow::DataType::DT_INT32_REF: + case tensorflow::DataType::DT_UINT8_REF: + case tensorflow::DataType::DT_INT16_REF: + case
tensorflow::DataType::DT_INT8_REF: + case tensorflow::DataType::DT_STRING_REF: + case tensorflow::DataType::DT_COMPLEX64_REF: + case tensorflow::DataType::DT_INT64_REF: + case tensorflow::DataType::DT_BOOL_REF: + case tensorflow::DataType::DT_QINT8_REF: + case tensorflow::DataType::DT_QUINT8_REF: + case tensorflow::DataType::DT_QINT32_REF: + case tensorflow::DataType::DT_BFLOAT16_REF: + case tensorflow::DataType::DT_QINT16_REF: + case tensorflow::DataType::DT_QUINT16_REF: + case tensorflow::DataType::DT_UINT16_REF: + case tensorflow::DataType::DT_COMPLEX128_REF: + case tensorflow::DataType::DT_HALF_REF: + case tensorflow::DataType::DT_RESOURCE_REF: + case tensorflow::DataType::DT_VARIANT_REF: + case tensorflow::DataType::DT_UINT32_REF: + case tensorflow::DataType::DT_UINT64_REF: + case tensorflow::DataType::DataType_INT_MAX_SENTINEL_DO_NOT_USE_: + case tensorflow::DataType::DataType_INT_MIN_SENTINEL_DO_NOT_USE_: throw std::runtime_error(""); + } + MIGRAPHX_THROW("Invalid tensor type"); +} + +} // namespace tf +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/docker/rocm/migraphx/tf/types.proto b/docker/rocm/migraphx/tf/types.proto new file mode 100644 index 000000000..03835d1b9 --- /dev/null +++ b/docker/rocm/migraphx/tf/types.proto @@ -0,0 +1,75 @@ +syntax = "proto3"; + +package tensorflow; +option cc_enable_arenas = true; +option java_outer_classname = "TypesProtos"; +option java_multiple_files = true; +option java_package = "org.tensorflow.framework"; +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; + +// LINT.IfChange +enum DataType { + // Not a legal value for DataType. Used to indicate a DataType field + // has not been set. + DT_INVALID = 0; + + // Data types that all computation devices are expected to be + // capable to support. + DT_FLOAT = 1; + DT_DOUBLE = 2; + DT_INT32 = 3; + DT_UINT8 = 4; + DT_INT16 = 5; + DT_INT8 = 6; + DT_STRING = 7; + DT_COMPLEX64 = 8; // Single-precision complex + DT_INT64 = 9; + DT_BOOL = 10; + DT_QINT8 = 11; // Quantized int8 + DT_QUINT8 = 12; // Quantized uint8 + DT_QINT32 = 13; // Quantized int32 + DT_BFLOAT16 = 14; // Float32 truncated to 16 bits. Only for cast ops. + DT_QINT16 = 15; // Quantized int16 + DT_QUINT16 = 16; // Quantized uint16 + DT_UINT16 = 17; + DT_COMPLEX128 = 18; // Double-precision complex + DT_HALF = 19; + DT_RESOURCE = 20; + DT_VARIANT = 21; // Arbitrary C++ data types + DT_UINT32 = 22; + DT_UINT64 = 23; + + // Do not use! These are only for parameters. Every enum above + // should have a corresponding value below (verified by types_test). 
+ DT_FLOAT_REF = 101; + DT_DOUBLE_REF = 102; + DT_INT32_REF = 103; + DT_UINT8_REF = 104; + DT_INT16_REF = 105; + DT_INT8_REF = 106; + DT_STRING_REF = 107; + DT_COMPLEX64_REF = 108; + DT_INT64_REF = 109; + DT_BOOL_REF = 110; + DT_QINT8_REF = 111; + DT_QUINT8_REF = 112; + DT_QINT32_REF = 113; + DT_BFLOAT16_REF = 114; + DT_QINT16_REF = 115; + DT_QUINT16_REF = 116; + DT_UINT16_REF = 117; + DT_COMPLEX128_REF = 118; + DT_HALF_REF = 119; + DT_RESOURCE_REF = 120; + DT_VARIANT_REF = 121; + DT_UINT32_REF = 122; + DT_UINT64_REF = 123; +} +// LINT.ThenChange( +// https://www.tensorflow.org/code/tensorflow/c/c_api.h, +// https://www.tensorflow.org/code/tensorflow/go/tensor.go, +// https://www.tensorflow.org/code/tensorflow/core/framework/tensor.cc, +// https://www.tensorflow.org/code/tensorflow/core/framework/types.h, +// https://www.tensorflow.org/code/tensorflow/core/framework/types.cc, +// https://www.tensorflow.org/code/tensorflow/python/framework/dtypes.py, +// https://www.tensorflow.org/code/tensorflow/python/framework/function.py) diff --git a/docker/rocm/migraphx/tf/versions.proto b/docker/rocm/migraphx/tf/versions.proto new file mode 100644 index 000000000..dd2ec5523 --- /dev/null +++ b/docker/rocm/migraphx/tf/versions.proto @@ -0,0 +1,32 @@ +syntax = "proto3"; + +package tensorflow; +option cc_enable_arenas = true; +option java_outer_classname = "VersionsProtos"; +option java_multiple_files = true; +option java_package = "org.tensorflow.framework"; +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; + +// Version information for a piece of serialized data +// +// There are different types of versions for each type of data +// (GraphDef, etc.), but they all have the same common shape +// described here. +// +// Each consumer has "consumer" and "min_producer" versions (specified +// elsewhere). A consumer is allowed to consume this data if +// +// producer >= min_producer +// consumer >= min_consumer +// consumer not in bad_consumers +// +message VersionDef { + // The version of the code that produced this data. + int32 producer = 1; + + // Any consumer below this version is not allowed to consume this data. + int32 min_consumer = 2; + + // Specific consumer versions which are disallowed (e.g. due to bugs). + repeated int32 bad_consumers = 3; +};
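
Note on the VersionDef message above: its comments describe the compatibility rule (producer >= min_producer, consumer >= min_consumer, consumer not in bad_consumers). The following C++ sketch is illustrative only and is not part of the diff; the names version_def, is_compatible, consumer_version, and consumer_min_producer are hypothetical stand-ins for whatever the consuming code actually uses.

#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical in-memory mirror of the VersionDef message defined above.
struct version_def
{
    int32_t producer     = 0;            // version of the code that produced the data
    int32_t min_consumer = 0;            // consumers below this version must reject the data
    std::vector<int32_t> bad_consumers;  // specific consumer versions that are disallowed
};

// A consumer may use the data if:
//   producer >= min_producer, consumer >= min_consumer, consumer not in bad_consumers
bool is_compatible(const version_def& data, int32_t consumer_version, int32_t consumer_min_producer)
{
    if(data.producer < consumer_min_producer)
        return false;
    if(consumer_version < data.min_consumer)
        return false;
    return std::find(data.bad_consumers.begin(), data.bad_consumers.end(), consumer_version) ==
           data.bad_consumers.end();
}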