// frigate/docker/rocm/migraphx/onnx/parse_multi_head_attention.cpp

/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/errors.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/ranges.hpp>
#include <cmath>
#include <string>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
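
// The MultiHeadAttention op accepts several input layouts; this enum records
// which one was detected from the input ranks in check_inputs():
//   q_k_v       - separate 3-D query/key/value tensors
//   q_k_v_cross - separate query with 4-D (pre-transposed) key/value tensors
//   kv_packed   - separate query plus a packed 5-D key/value tensor
//   qkv_packed  - a single packed 5-D query/key/value tensor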
enum class qkv_format_t
{
q_k_v = 0,
q_k_v_cross = 1,
kv_packed = 2,
qkv_packed = 3
};
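
// Shape parameters deduced from the inputs; hidden_size == num_heads * head_size,
// and the *_v variants track the value tensor, whose head size may differ.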
struct multi_head_attention_parameters
{
int64_t batch_size;
int64_t q_sequence_length;
int64_t kv_sequence_length;
int64_t hidden_size;
int64_t hidden_size_v;
int64_t head_size;
int64_t head_size_v;
    qkv_format_t qkv_format;
};
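
// Parses the MultiHeadAttention contrib op (the com.microsoft domain in ONNX
// Runtime) into plain MIGraphX instructions implementing scaled dot-product
// attention.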
struct parse_multi_head_attention : op_parser<parse_multi_head_attention>
{
std::vector<op_desc> operators() const { return {{"MultiHeadAttention"}}; }
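
    // Splits a packed (batch_size, q_sequence_length, num_heads, 3, head_size)
    // QKV tensor into separate query, key, and value tensors.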
void unpack_qkv(const onnx_parser::node_info& info,
instruction_ref& query,
instruction_ref& key,
instruction_ref& value) const
{
// (batch_size, q_sequence_length, num_heads, 3, head_size) ->
// (3, batch_size, q_sequence_length, num_heads, head_size)
auto qkv_packed =
info.add_instruction(make_op("transpose", {{"permutation", {3, 0, 1, 2, 4}}}), query);
query = info.add_instruction(
make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), qkv_packed);
query = info.add_instruction(make_op("squeeze", {{"axes", {0}}}), query);
key = info.add_instruction(
make_op("slice", {{"axes", {0}}, {"starts", {1}}, {"ends", {2}}}), qkv_packed);
key = info.add_instruction(make_op("squeeze", {{"axes", {0}}}), key);
value = info.add_instruction(
make_op("slice", {{"axes", {0}}, {"starts", {2}}, {"ends", {3}}}), qkv_packed);
value = info.add_instruction(make_op("squeeze", {{"axes", {0}}}), value);
}
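
    // Splits a packed (batch_size, kv_sequence_length, num_heads, 2, head_size)
    // KV tensor into separate key and value tensors.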
void unpack_kv(const onnx_parser::node_info& info,
instruction_ref& key,
instruction_ref& value) const
{
// (batch_size, kv_sequence_length, num_heads, 2, head_size) ->
// (2, batch_size, kv_sequence_length, num_heads, head_size)
auto kv_packed =
info.add_instruction(make_op("transpose", {{"permutation", {3, 0, 1, 2, 4}}}), key);
key = info.add_instruction(
make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), kv_packed);
key = info.add_instruction(make_op("squeeze", {{"axes", {0}}}), key);
value = info.add_instruction(
make_op("slice", {{"axes", {0}}, {"starts", {1}}, {"ends", {2}}}), kv_packed);
value = info.add_instruction(make_op("squeeze", {{"axes", {0}}}), value);
}
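
    // Validates the query/key/value shapes, deduces the QKV layout from the
    // input ranks, and fills in the shape parameters used by parse().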
void check_inputs(const std::vector<instruction_ref>& args,
const int64_t num_heads,
multi_head_attention_parameters& params) const
{
if(args.empty() or args.size() > 3)
MIGRAPHX_THROW("MultiHeadAttention: Wrong number of inputs. Only 'query', 'key' and "
"'value' inputs are supported.");
auto query_dim = args[0]->get_shape().ndim();
auto query_lens = args[0]->get_shape().lens();
params.batch_size = query_lens[0];
params.q_sequence_length = query_lens[1];
if(query_dim != 3 and query_dim != 5)
MIGRAPHX_THROW("MultiHeadAttention: Input 'query' rank needs to be 3 or 5, current: " +
std::to_string(query_dim));
if(query_dim == 5)
{
if(query_lens[2] != num_heads or query_lens[3] != 3)
MIGRAPHX_THROW("MultiHeadAttention: Input 'query' shape needs to be (batch_size, "
"q_sequence_length, num_heads, 3, head_size) for packed input.");
params.kv_sequence_length = query_lens[1];
params.head_size = query_lens[4];
params.head_size_v = query_lens[4];
params.hidden_size = num_heads * query_lens[4];
params.hidden_size_v = num_heads * query_lens[4];
            params.qkv_format = qkv_format_t::qkv_packed;
}
else // query_dim == 3
{
if(args.size() < 2)
MIGRAPHX_THROW("MultiHeadAttention: Wrong number of inputs, 'key' is missing.");
params.hidden_size = query_lens[2];
params.head_size = query_lens[2] / num_heads;
auto key_dim = args[1]->get_shape().ndim();
auto key_lens = args[1]->get_shape().lens();
if(key_dim < 3 or key_dim > 5)
MIGRAPHX_THROW(
"MultiHeadAttention: Input 'key' rank needs to be 3, 4 or 5, current: " +
std::to_string(key_dim));
if(key_dim == 5)
{
if(key_lens[0] != params.batch_size or key_lens[2] != num_heads or
key_lens[3] != 2 or key_lens[4] != params.head_size)
MIGRAPHX_THROW("MultiHeadAttention: Input 'key' shape needs to be (batch_size, "
"kv_sequence_length, num_heads, 2, head_size)");
params.kv_sequence_length = key_lens[1];
params.hidden_size_v = params.hidden_size;
params.head_size_v = key_lens[4];
                params.qkv_format = qkv_format_t::kv_packed;
}
else
{
if(args.size() < 3)
MIGRAPHX_THROW(
"MultiHeadAttention: Wrong number of inputs, 'value' is missing.");
auto value_dim = args[2]->get_shape().ndim();
auto value_lens = args[2]->get_shape().lens();
                if(key_dim != value_dim)
                    MIGRAPHX_THROW(
                        "MultiHeadAttention: Inputs 'key' and 'value' must have equal rank.");
if(key_dim == 3)
{
if(key_lens[0] != params.batch_size or key_lens[2] != params.hidden_size)
MIGRAPHX_THROW("MultiHeadAttention: Input 'key' shape needs to be "
"(batch_size, kv_sequence_length, hidden_size)");
if(value_lens[0] != params.batch_size or value_lens[1] != key_lens[1])
MIGRAPHX_THROW("MultiHeadAttention: Input 'value' shape needs to be "
"(batch_size, kv_sequence_length, hidden_size_v)");
params.kv_sequence_length = key_lens[1];
params.hidden_size_v = value_lens[2];
params.head_size_v = value_lens[2] / num_heads;
                    params.qkv_format = qkv_format_t::q_k_v;
}
else // key_dim == 4
{
if(key_lens[0] != params.batch_size or key_lens[1] != num_heads or
key_lens[3] != params.head_size)
MIGRAPHX_THROW("MultiHeadAttention: Input 'key' shape needs to be "
"(batch_size, num_heads, kv_sequence_length, head_size)");
if(value_lens[0] != params.batch_size or value_lens[1] != num_heads or
value_lens[2] != key_lens[2])
MIGRAPHX_THROW("MultiHeadAttention: Input 'value' shape needs to be "
"(batch_size, num_heads, kv_sequence_length, head_size_v)");
params.kv_sequence_length = key_lens[2];
params.hidden_size_v = value_lens[3] * num_heads;
params.head_size_v = value_lens[3];
                    params.qkv_format = qkv_format_t::q_k_v_cross;
}
}
}
}
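
    // Lowers MultiHeadAttention: normalizes query/key/value to
    // (batch_size, num_heads, sequence_length, head_size), computes
    // softmax(Q * K^T * scale) * V, and reshapes the result back to
    // (batch_size, q_sequence_length, hidden_size_v).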
instruction_ref parse(const op_desc& /*opd*/,
const onnx_parser& parser,
const onnx_parser::node_info& info,
const std::vector<instruction_ref>& args) const
{
if(not contains(info.attributes, "num_heads"))
MIGRAPHX_THROW("MultiHeadAttention: num_heads attribute is required");
int64_t num_heads = parser.parse_value(info.attributes.at("num_heads")).at<int>();
multi_head_attention_parameters params;
check_inputs(args, num_heads, params);
auto query = args[0];
instruction_ref key;
instruction_ref value;
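        // 'key' and 'value' are populated below according to the detected layout.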
        if(params.qkv_format == qkv_format_t::qkv_packed)
{
// Packed QKV: (batch_size, q_sequence_length, num_heads, 3, head_size)
unpack_qkv(info, query, key, value);
}
else
{
// Query: (batch_size, q_sequence_length, hidden_size)
std::vector<int64_t> q_dims{
params.batch_size, params.q_sequence_length, num_heads, params.head_size};
query = info.add_instruction(make_op("reshape", {{"dims", q_dims}}), query);
key = args[1];
            if(params.qkv_format == qkv_format_t::kv_packed)
{
// Packed KV: (batch_size, kv_sequence_length, num_heads, 2, head_size)
unpack_kv(info, key, value);
}
else
{
value = args[2];
                if(params.qkv_format == qkv_format_t::q_k_v)
{
// Key: (batch_size, kv_sequence_length, hidden_size)
// Value: (batch_size, kv_sequence_length, hidden_size_v)
std::vector<int64_t> k_dims{
params.batch_size, params.kv_sequence_length, num_heads, params.head_size};
std::vector<int64_t> v_dims{params.batch_size,
params.kv_sequence_length,
num_heads,
params.head_size_v};
key = info.add_instruction(make_op("reshape", {{"dims", k_dims}}), key);
value = info.add_instruction(make_op("reshape", {{"dims", v_dims}}), value);
}
}
}
        // Transpose to (batch_size, num_heads, sequence_length, head_size);
        // cross-attention key/value inputs already arrive in this layout.
std::vector<int64_t> perm{0, 2, 1, 3};
query = info.add_instruction(make_op("transpose", {{"permutation", perm}}), query);
        if(params.qkv_format != qkv_format_t::q_k_v_cross)
{
key = info.add_instruction(make_op("transpose", {{"permutation", perm}}), key);
value = info.add_instruction(make_op("transpose", {{"permutation", perm}}), value);
}
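
        // Scaling factor defaults to 1/sqrt(head_size); the optional "scale"
        // attribute overrides it.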
float scale = 1 / std::sqrt(params.head_size);
if(contains(info.attributes, "scale"))
scale = parser.parse_value(info.attributes.at("scale")).at<float>();
auto scale_literal = info.add_literal(
migraphx::literal{migraphx::shape{query->get_shape().type()}, {scale}});
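
        // Scaled dot-product attention: softmax(Q * K^T * scale) * V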
auto key_transposed =
info.add_instruction(make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), key);
auto result = info.add_instruction(make_op("dot"), query, key_transposed);
result = info.add_common_op("mul", result, scale_literal);
result = info.add_instruction(make_op("softmax", {{"axis", -1}}), result);
result = info.add_instruction(make_op("dot"), result, value);
result = info.add_instruction(make_op("transpose", {{"permutation", perm}}), result);
result = info.add_instruction(
make_op(
"reshape",
{{"dims", {params.batch_size, params.q_sequence_length, params.hidden_size_v}}}),
result);
return result;
}
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx