
Commit d80b289

ggml-qnn: offload quantized type mulmat to QNN backend
1 parent 71dae47 commit d80b289

File tree

1 file changed: +130 -42 lines changed


ggml/src/ggml-qnn/ggml-qnn.cpp

Lines changed: 130 additions & 42 deletions
@@ -75,6 +75,7 @@
 #include <unordered_set>
 #include <utility>
 #include <stdatomic.h>
+#include <future>
 #if (defined __ANDROID__) || (defined ANDROID)
 #include "android/log.h"
 #endif
@@ -815,6 +816,11 @@ struct ggml_backend_qnn_context {
     QNN_INTERFACE_VER_TYPE raw_interface;
     QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface;
     struct qcom_socinfo socinfo;
+
+    std::unique_ptr<char[]> work_data;
+    std::vector<std::future<void>> tasks;
+    size_t work_size = 0;
+    int n_threads = GGML_DEFAULT_N_THREADS;
 } ;

 //the following helper funcs are used to ensure every QNN tensor name is unique
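
The four new members implement a small scratch-buffer/worker-pool pattern that the mulmat hunks below rely on. Here is a minimal standalone sketch of that pattern (illustrative only; `scratch_ctx` and `run_parallel` are invented names, not backend code): a grow-only buffer reused across calls, plus std::async workers joined through their std::future handles.

```cpp
// Sketch of the pattern enabled by the new context members: a grow-only scratch buffer
// reused across calls, plus std::async workers joined via std::future before the buffer is used.
#include <cstddef>
#include <cstdio>
#include <future>
#include <memory>
#include <vector>

struct scratch_ctx {
    std::unique_ptr<char[]>        work_data;      // grow-only scratch buffer
    std::vector<std::future<void>> tasks;          // in-flight helper tasks
    size_t                         work_size = 0;
    int                            n_threads = 4;  // stand-in for GGML_DEFAULT_N_THREADS
};

static void run_parallel(scratch_ctx & ctx, size_t needed_bytes) {
    if (ctx.work_size < needed_bytes) {            // grow only when a larger request arrives
        ctx.work_data.reset(new char[needed_bytes]);
        ctx.work_size = needed_bytes;
    }
    char * wdata = ctx.work_data.get();
    for (int i = 1; i < ctx.n_threads; i++) {      // spawn helpers for slices 1..n-1
        ctx.tasks.push_back(std::async(std::launch::async, [=]() {
            wdata[i] = (char) i;                   // placeholder for per-slice work
        }));
    }
    wdata[0] = 0;                                  // the calling thread handles slice 0
    for (auto & t : ctx.tasks) {                   // join all helpers before using the buffer
        t.get();
    }
    ctx.tasks.clear();
}

int main() {
    scratch_ctx ctx;
    run_parallel(ctx, 1024);
    printf("work_size = %zu\n", ctx.work_size);
    return 0;
}
```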
@@ -2780,7 +2786,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
     const uint32_t src1_rank = ggml_get_tensor_rank(src1);

     if (tensor->op == GGML_OP_ADD) {
-        //dump_tensors_info(tensor);
+        //dump_op_info(tensor);
         if (!ggml_are_same_shape(src0, src1)) {
             return false;
         }
@@ -2791,6 +2797,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
     }

     if (tensor->op == GGML_OP_MUL_MAT) {
+        dump_op_info(tensor);
         if (src0_rank != src1_rank) // make QNN SDK happy
             return false;
         if (src0_rank < 2) // QNN's limitation, make QNN SDK happy
@@ -2800,17 +2807,18 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
         if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy
             return false;

-        //TODO: support more data type in func ggml_qnn_mul_mat(...)
-        //src0: q4_0, q6_k, ...
-        //src1: f32
-        //dst : f32
-        return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
-               && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
-               && (src0->type == src1->type) && (src0->type == tensor->type);
+        if (2 != src0_rank) { //TODO: quantize src0 for 3D & 4D matrix
+            return (src0->type == GGML_TYPE_F32)
+                   && (src1->type == GGML_TYPE_F32)
+                   && (tensor->type == GGML_TYPE_F32);
+        } else {
+            return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q6_K)
+                   && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32);
+        }
     }

     if (tensor->op == GGML_OP_MUL) {
-        //dump_tensors_info(tensor);
+        //dump_op_info(tensor);
         if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix
             return false;
         return (src0->type == GGML_TYPE_F32)
@@ -2870,7 +2878,9 @@ static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) {
         p_tensor1 = ggml_qnn_create_compute_tensor(src1);
         p_tensor2 = ggml_qnn_create_compute_tensor(dst);
     }
+#if GGMLQNN_PRINT_OP_ADD_LOG
     print_tensors_info(__func__, ctx, src0, src1, dst);
+#endif

     //ensure QNN tensor has correct tensor type
     QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE;
@@ -2966,7 +2976,6 @@ static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) {

         auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors);
         instance->_qnn_graph_map[graph_name] = graph_item;
-
     } else {
         Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
         Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
@@ -3039,22 +3048,31 @@ static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) {
     QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions;
     QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions;
     QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions;
+
+#if GGMLQNN_PRINT_OP_ADD_LOG
     op_perf.info();
+#endif
 }

 /*
- * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated
- * than ggml_qnn_general_node.
- * matrix transpose and type trait are required for offload mulmat to QNN backend,
- * so it's a standalone function. accordingly, this is another typical skeleton for offload other
- * ggml ops to QNN backend
+ * @brief performs matrix multiplication with FP32 or quantized weights and floating-point inputs
+ *        using the QNN backend. this function multiplies the input tensor `src1` by the weight
+ *        tensor `src0`, handling transposition and dequantization as needed, and stores the
+ *        result in the destination tensor `dst`.
 *
- * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, should focus on MUL_MAT.
+ * @param backend the QNN backend; its context is obtained through
+ *                (ggml_backend_qnn_context *)backend->context for the QNN backend operations.
+ * @param op      the destination tensor where the result of the matrix multiplication will be stored.
 *
- * have three kinds of MUL_MAT to compute:
- * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend
- * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1
- * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, src0 -> f32 in src0', then src0' * src1
+ * @note the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more
+ *       complicated, so it's a standalone function. accordingly, it is another typical skeleton
+ *       for offloading other ggml ops to the QNN backend. MUL_MAT takes most of the compute
+ *       time (about 95%), so to speed up llama inference we should focus on this function.
+ *       there are three kinds of MUL_MAT to compute:
+ *       mul_mat_f32:     both src0 and src1 are F32, this is naturally handled in the QNN backend
+ *       mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1
+ *       mul_mat_q_f32:   src0 is quantized (Q4_0, Q4_1, Q6_K...)
+ *                        and src1 is F32, src0 -> f32 in src0', then src0' * src1
 */
 static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
     Qnn_ErrorHandle_t error = QNN_SUCCESS;
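
To make the mul_mat_q_f32 path described above concrete, here is a hedged, self-contained sketch of the dequantization step (src0 -> src0' in F32) using the same ggml type-trait calls that appear in the hunk below; the helper name `dequantize_rows` is invented and this is not the backend implementation itself.

```cpp
// Sketch of "src0 -> f32 in src0'": convert a 2-D quantized weight tensor to a float buffer
// using the type traits' to_float, so a float matmul (src0' * src1) can follow.
// Assumes src0->type is a quantized type (e.g. Q4_0/Q6_K) whose traits provide to_float.
#include "ggml.h"
#include <vector>

static std::vector<float> dequantize_rows(const ggml_tensor * src0) {
    const auto *  traits = ggml_get_type_traits(src0->type);
    const int64_t rows   = src0->ne[1];   // ne01
    const int64_t cols   = src0->ne[0];   // ne00
    std::vector<float> out((size_t)(rows * cols));
    for (int64_t r = 0; r < rows; r++) {
        const char * src_row = (const char *) src0->data + r * src0->nb[1]; // nb01: row stride in bytes
        traits->to_float(src_row, out.data() + r * cols, cols);
    }
    return out;                           // src0' in F32, ready for the float matmul
}
```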
@@ -3077,10 +3095,72 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
     QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
     op_perf.start();

-    uint32_t src0_rank = ggml_get_tensor_rank(src0);
-    uint32_t src1_rank = ggml_get_tensor_rank(src1);
+    const enum ggml_type type = src0->type;
+    const uint32_t src0_rank = ggml_get_tensor_rank(src0);
+    const uint32_t src1_rank = ggml_get_tensor_rank(src1);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
     GGML_ASSERT(src0_rank == src1_rank);
-    GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation
+    GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy
+
+    // broadcast factors
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+    const int64_t ne_plane = ne01 * ne00;
+    const size_t desired_size = ((GGML_TYPE_F32 == type) ? 0 : ne03 * ne02 * ne_plane * sizeof(float));
+    if (ctx->work_size < desired_size) {
+        ctx->work_data.reset(new char[desired_size]);
+        ctx->work_size = desired_size;
+    }
+    void * wdata = ctx->work_data.get();
+    // convert src0 to float
+    if (type != GGML_TYPE_F32) {
+        const auto * type_traits = ggml_get_type_traits(type);
+        ggml_to_float_t const to_float = type_traits->to_float;
+
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03;
+                float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane;
+
+                const int min_cols_per_thread = 4096;
+                const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1);
+                const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1);
+                for (int i = 1; i < n_threads; i++) {
+                    const int64_t start = i * ne01 / n_threads;
+                    const int64_t end = (i + 1) * ne01 / n_threads;
+                    if (start < end) {
+                        ctx->tasks.push_back(std::async(std::launch::async, [=]() {
+                            for (int64_t i01 = start; i01 < end; i01++) {
+                                to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00);
+                            }
+                        }));
+                    }
+                }
+                {
+                    // reuse the current thread for the first task
+                    const int64_t start = 0;
+                    const int64_t end = ne01 / n_threads;
+                    for (int64_t i01 = start; i01 < end; i01++) {
+                        to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00);
+                    }
+                }
+            }
+        }
+
+        // wait for all tasks to finish
+        for (auto & task : ctx->tasks) {
+            task.get();
+        }
+        ctx->tasks.clear();
+    }

     std::string graph_name;
     get_graph_key_from_op(op, graph_name);
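
As a quick sanity check of the row partitioning in the hunk above, the snippet below replays the same arithmetic with hypothetical sizes (ne00 = ne01 = 4096, ctx->n_threads = 8): each worker converts a contiguous block of rows, the calling thread takes the first block, and the scratch buffer holds the whole plane in F32.

```cpp
// Worked example (hypothetical sizes) of the row partitioning used in the hunk above.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne00 = 4096, ne01 = 4096;   // columns and rows of one src0 plane
    const int     ctx_n_threads = 8;          // stand-in for ctx->n_threads

    const int min_cols_per_thread = 4096;
    const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1);                 // = 1
    const int n_threads = std::max(std::min(ctx_n_threads, (int)(ne01 / min_rows_per_thread)), 1);  // = 8

    for (int i = 0; i < n_threads; i++) {
        const int64_t start = i * ne01 / n_threads;       // rows handled by worker i
        const int64_t end   = (i + 1) * ne01 / n_threads; // (i == 0 runs on the calling thread)
        printf("worker %d: rows [%lld, %lld)\n", i, (long long)start, (long long)end);
    }
    // scratch bytes for a single plane: 4096 * 4096 * sizeof(float) = 64 MiB
    printf("scratch bytes: %lld\n", (long long)(ne01 * ne00 * (int64_t)sizeof(float)));
    return 0;
}
```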
@@ -3133,9 +3213,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {

           2. QNN's MatMul can only support input tensors with rank >= 2

-           there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose operation when offloading mulmat to QNN backend.
+           all in all, there is a gap between ggml mulmat and QNN mulmat, so we need to perform a
+           transpose operation when offloading mulmat to the QNN backend. this concise implementation
+           handles the transpose in func ggml_qnn_create_general_tensor()
         */
-
        //step-1: create qnn graph
        error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(),
@@ -3158,8 +3239,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
        CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0));
        CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1));
        CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2));
-
-       QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+       if (type != GGML_TYPE_F32) {
+           QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast<uint32_t>(desired_size)};
+       } else {
+           QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+       }
        QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
        QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

@@ -3170,14 +3254,14 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
        //step-5: compose qnn graph: add mat_mul node
        Qnn_Param_t out_0_params[] = {
                {QNN_PARAMTYPE_SCALAR,
-               QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1,
-               .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1}
+                QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1,
+                .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1}
                }
        };

        Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1};
        Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose};
-#if 0
+#if 0 //kept here to make the code easier to understand; can be removed in the future
        Qnn_OpConfig_t out_0 = {
             QNN_OPCONFIG_VERSION_1, .v1 =
                {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
@@ -3202,7 +3286,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
        };
        Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose};
        Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2};
-#if 0
+#if 0 //kept here to make the code easier to understand; can be removed in the future
        Qnn_OpConfig_t out_trans1_0 = {
            QNN_OPCONFIG_VERSION_1,
            .v1 = {"ggmlqnn_mulmat_transpose_opconfig",
@@ -3216,7 +3300,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
        };
#else
        Qnn_OpConfig_t out_trans1_0 = create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE,
-               out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1);
+                                                       out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1);
#endif
        CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0));

@@ -3225,9 +3309,9 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
        Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1};
        Qnn_Tensor_t output_tensors_0[] = {*p_tensor2};
        CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
-               input_tensors_0, 2,
-               output_tensors_0, 1,
-               nullptr, nullptr));
+                                                            input_tensors_0, 2,
+                                                            output_tensors_0, 1,
+                                                            nullptr, nullptr));

        qnn_tensors_t ggml_op_mulmat_tensors;
        ggml_op_mulmat_tensors.reserve(5);
@@ -3239,7 +3323,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
        auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors);
        instance->_qnn_graph_map[graph_name] = graph_item;
    } else {
-       QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+       if (type != GGML_TYPE_F32) {
+           QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast<uint32_t>(desired_size)};
+       } else {
+           QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+       }
        QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
        QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

@@ -3250,13 +3338,13 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
        Qnn_Tensor_t tensor_outputs[] = {
            *p_tensor2
        };
-       // this is the second technical approach of "how to utilize the Hexagon NPU maximally" through
-       // QNN SDK, details could be found at
-       // https://github.com/kantv-ai/llama.cpp/wiki/mapping-ggml-compute-graph-to-QNN-compute-graph
+       // this is the second technical approach (or another pipeline) of "how to utilize the Hexagon
+       // NPU maximally" through the QNN SDK; details can be found at
+       // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360
        CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
-               tensor_inputs, 2,
-               tensor_outputs, 1,
-               nullptr, nullptr));
+                                                            tensor_inputs, 2,
+                                                            tensor_outputs, 1,
+                                                            nullptr, nullptr));
    }

    // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor
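
For readers following the cached-graph branch above: the pattern is to build the QNN graph once per op signature, store it in _qnn_graph_map keyed by graph_name, and on later calls only re-bind the clientBuf pointers before graphExecute. Below is a rough standalone sketch of that pattern under stated assumptions; names such as `graph_cache_t`, `fake_graph` and `run_mulmat` are invented, whereas the real code stores Qnn_GraphHandle_t and Qnn_Tensor_t objects.

```cpp
// Sketch of the build-once / re-bind-and-execute graph cache used by ggml_qnn_mul_mat.
#include <cstdio>
#include <map>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

struct fake_graph  { int id = 0; };                 // stand-in for Qnn_GraphHandle_t
struct fake_tensor { void * client_buf = nullptr; };// stand-in for Qnn_Tensor_t

using graph_item_t  = std::tuple<fake_graph, std::vector<fake_tensor>>;
using graph_cache_t = std::map<std::string, graph_item_t>;

static void run_mulmat(graph_cache_t & cache, const std::string & key,
                       void * src0_buf, void * src1_buf, void * dst_buf) {
    auto it = cache.find(key);
    if (it == cache.end()) {
        // first call with this key: create the graph, its tensors, the MatMul (+ Transpose)
        // nodes, finalize the graph, then cache everything under the op's graph key
        graph_item_t item = std::make_tuple(fake_graph{}, std::vector<fake_tensor>(3));
        it = cache.emplace(key, std::move(item)).first;
    }
    // every call: re-point the cached tensors at the current ggml buffers, then execute
    auto & tensors = std::get<1>(it->second);
    tensors[0].client_buf = src0_buf;   // src0 data (or the dequantized wdata buffer)
    tensors[1].client_buf = src1_buf;
    tensors[2].client_buf = dst_buf;
    // ... graphExecute(graph, inputs, outputs) would run here ...
}

int main() {
    graph_cache_t cache;
    float src0[4], src1[4], dst[4];
    run_mulmat(cache, "mulmat_4x4", src0, src1, dst);  // builds and caches the graph
    run_mulmat(cache, "mulmat_4x4", src0, src1, dst);  // reuses it, only re-binds buffers
    printf("cached graphs: %zu\n", cache.size());
    return 0;
}
```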
