#include <unordered_set>
#include <utility>
#include <stdatomic.h>
+ #include <future>
#if (defined __ANDROID__) || (defined ANDROID)
#include "android/log.h"
#endif
@@ -815,6 +816,11 @@ struct ggml_backend_qnn_context {
    QNN_INTERFACE_VER_TYPE raw_interface;
    QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface;
    struct qcom_socinfo socinfo;
+
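+   // scratch buffer and async tasks used by ggml_qnn_mul_mat() to dequantize quantized weights (src0) to FP32 before offloading to QNN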
+   std::unique_ptr<char[]> work_data;
+   std::vector<std::future<void>> tasks;
+   size_t work_size = 0;
+   int n_threads = GGML_DEFAULT_N_THREADS;
};

// the following helper funcs are used to ensure every QNN tensor name is unique
@@ -2780,7 +2786,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
    const uint32_t src1_rank = ggml_get_tensor_rank(src1);

    if (tensor->op == GGML_OP_ADD) {
-       // dump_tensors_info(tensor);
+       // dump_op_info(tensor);
        if (!ggml_are_same_shape(src0, src1)) {
            return false;
        }
@@ -2791,6 +2797,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
    }

    if (tensor->op == GGML_OP_MUL_MAT) {
+       dump_op_info(tensor);
        if (src0_rank != src1_rank) // make QNN SDK happy
            return false;
        if (src0_rank < 2) // QNN's limitation, make QNN SDK happy
@@ -2800,17 +2807,18 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
        if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy
            return false;

-       // TODO: support more data type in func ggml_qnn_mul_mat(...)
-       // src0: q4_0, q6_k, ...
-       // src1: f32
-       // dst : f32
-       return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
-              && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
-              && (src0->type == src1->type) && (src0->type == tensor->type);
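+       // for 2D mulmat, quantized src0 (Q8_0 / Q4_0 / Q6_K) is accepted because ggml_qnn_mul_mat() dequantizes it to FP32 first;
+       // 3D/4D src0 must already be FP32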
+       if (2 != src0_rank) { // TODO: quantize src0 for 3D & 4D matrix
+           return (src0->type == GGML_TYPE_F32)
+                  && (src1->type == GGML_TYPE_F32)
+                  && (tensor->type == GGML_TYPE_F32);
+       } else {
+           return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q6_K)
+                  && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32);
+       }
    }

    if (tensor->op == GGML_OP_MUL) {
-       // dump_tensors_info(tensor);
+       // dump_op_info(tensor);
        if ((src0_rank != 2) || (src1_rank != 2)) // TODO: 3D and 4D matrix
            return false;
        return (src0->type == GGML_TYPE_F32)
@@ -2870,7 +2878,9 @@ static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) {
        p_tensor1 = ggml_qnn_create_compute_tensor(src1);
        p_tensor2 = ggml_qnn_create_compute_tensor(dst);
    }
+#if GGMLQNN_PRINT_OP_ADD_LOG
    print_tensors_info(__func__, ctx, src0, src1, dst);
+#endif

    // ensure QNN tensor has correct tensor type
    QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE;
@@ -2966,7 +2976,6 @@ static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) {

        auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors);
        instance->_qnn_graph_map[graph_name] = graph_item;
-
    } else {
        Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
        Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
@@ -3039,22 +3048,31 @@ static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) {
    QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions;
    QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions;
    QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions;
+
+#if GGMLQNN_PRINT_OP_ADD_LOG
    op_perf.info();
+#endif
}

/*
- * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated
- * than ggml_qnn_general_node.
- * matrix transpose and type trait are required for offload mulmat to QNN backend,
- * so it's a standalone function. accordingly, this is another typical skeleton for offload other
- * ggml ops to QNN backend
+ * @brief performs matrix multiplication of floating-point inputs with FP32 or quantized weights
+ *        using the QNN backend. this function multiplies the input tensor `src1` by the weight
+ *        tensor `src0`, handling transposition and dequantization as needed, and stores the
+ *        result in the destination tensor `dst`.
 *
- * MUL_MAT take most of the compute time (about 95%). so to speed up llama inference, should focus on MUL_MAT.
+ * @param backend the backend whose context, obtained via (ggml_backend_qnn_context *)backend->context,
+ *                is used for the QNN backend operations.
+ * @param op      the destination tensor where the result of the matrix multiplication will be stored.
 *
- * have three kinds of MUL_MAT to compute:
- * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend
- * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1
- * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, src0 -> f32 in src0', then src0' * src1
+ * @note the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated,
+ *       so it is a standalone function and serves as another typical skeleton for offloading other
+ *       ggml ops to the QNN backend. MUL_MAT takes most of the compute time (about 95%), so speeding
+ *       up llama inference means focusing on this function. there are three kinds of MUL_MAT to compute:
+ *       mul_mat_f32:     both src0 and src1 are F32, this is naturally handled by the QNN backend
+ *       mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1
+ *       mul_mat_q_f32:   src0 is quantized (Q4_0, Q4_1, Q6_K, ...) and src1 is F32,
+ *                        src0 -> f32 in src0', then src0' * src1
 */
static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
    Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -3077,10 +3095,72 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
    op_perf.start();

-   uint32_t src0_rank = ggml_get_tensor_rank(src0);
-   uint32_t src1_rank = ggml_get_tensor_rank(src1);
+   const enum ggml_type type = src0->type;
+   const uint32_t src0_rank  = ggml_get_tensor_rank(src0);
+   const uint32_t src1_rank  = ggml_get_tensor_rank(src1);
+
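+   // GGML_TENSOR_BINARY_OP_LOCALS brings the ne0*/nb0* (src0), ne1*/nb1* (src1) and ne*/nb* (dst) shape/stride locals into scope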
+   GGML_TENSOR_BINARY_OP_LOCALS
+   GGML_ASSERT(ne0 == ne01);
+   GGML_ASSERT(ne1 == ne11);
+   GGML_ASSERT(ne2 == ne12);
+   GGML_ASSERT(ne3 == ne13);
+   GGML_ASSERT(nb00 == ggml_type_size(type));
+   GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
    GGML_ASSERT(src0_rank == src1_rank);
-   GGML_ASSERT(src0_rank >= 2); // QNN SDK's limitation
+   GGML_ASSERT(src0_rank >= 2); // QNN SDK's limitation, make QNN SDK happy
+
+   // broadcast factors
+   const int64_t r2 = ne12 / ne02;
+   const int64_t r3 = ne13 / ne03;
+   const int64_t ne_plane = ne01 * ne00;
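+   // scratch size needed to hold all of src0 dequantized to FP32 (0 when src0 is already FP32 and no conversion is needed)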
+   const size_t desired_size = ((GGML_TYPE_F32 == type) ? 0 : ne03 * ne02 * ne_plane * sizeof(float));
+   if (ctx->work_size < desired_size) {
+       ctx->work_data.reset(new char[desired_size]);
+       ctx->work_size = desired_size;
+   }
+   void * wdata = ctx->work_data.get();
+   // convert src0 to float
+   if (type != GGML_TYPE_F32) {
+       const auto * type_traits = ggml_get_type_traits(type);
+       ggml_to_float_t const to_float = type_traits->to_float;
+
+       for (int64_t i03 = 0; i03 < ne03; i03++) {
+           for (int64_t i02 = 0; i02 < ne02; i02++) {
+               const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03;
+               float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane;
+
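+               // heuristic: only spawn extra threads when each one gets at least min_rows_per_thread rows
+               // (roughly min_cols_per_thread elements) of this plane to convert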
+               const int min_cols_per_thread = 4096;
+               const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1);
+               const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1);
+               for (int i = 1; i < n_threads; i++) {
+                   const int64_t start = i * ne01 / n_threads;
+                   const int64_t end   = (i + 1) * ne01 / n_threads;
+                   if (start < end) {
+                       ctx->tasks.push_back(std::async(std::launch::async, [=]() {
+                           for (int64_t i01 = start; i01 < end; i01++) {
+                               to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00);
+                           }
+                       }));
+                   }
+               }
+               {
+                   // reuse the current thread for the first task
+                   const int64_t start = 0;
+                   const int64_t end   = ne01 / n_threads;
+                   for (int64_t i01 = start; i01 < end; i01++) {
+                       to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00);
+                   }
+               }
+           }
+       }
+
+       // wait for all tasks to finish
+       for (auto & task : ctx->tasks) {
+           task.get();
+       }
+       ctx->tasks.clear();
+   }
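+   // at this point wdata holds src0 fully dequantized to FP32, laid out plane by plane, and can be bound to the QNN tensor in place of src0->data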

    std::string graph_name;
    get_graph_key_from_op(op, graph_name);
@@ -3133,9 +3213,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {

        2. QNN's MatMul can only support input tensors with rank >= 2

-          there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose operation when offloading mulmat to QNN backend.
+          all in all, there is a gap between ggml's mulmat and QNN's mulmat: we need to perform a transpose
+          operation when offloading mulmat to the QNN backend. this concise implementation handles the
+          transpose in func ggml_qnn_create_general_tensor()
    */
-
    // step-1: create qnn graph
    error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(),
                                          graph_name.c_str(), nullptr, &graph_handle);
@@ -3158,8 +3239,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
    CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0));
    CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1));
    CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2));
-
-   QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
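+   // when src0 is quantized, bind the QNN input tensor to the dequantized FP32 scratch buffer (wdata) instead of src0->data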
+   if (type != GGML_TYPE_F32) {
+       QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast<uint32_t>(desired_size)};
+   } else {
+       QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+   }
    QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
    QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

@@ -3170,14 +3254,14 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
    // step-5: compose qnn graph: add mat_mul node
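    // note: transpose_in1 = 1 asks QNN's MatMul to transpose its second input (here src1) before multiplying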
    Qnn_Param_t out_0_params[] = {
        {QNN_PARAMTYPE_SCALAR,
-        QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1,
-        .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1}
+         QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1,
+         .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1}
        }
    };

    Qnn_Tensor_t out_0_inputs[]  = {*p_tensor0, *p_tensor1};
    Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose};
-#if 0
+#if 0 // kept here to make the code easier to understand; can be removed in the future
    Qnn_OpConfig_t out_0 = {
        QNN_OPCONFIG_VERSION_1, .v1 =
            {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
@@ -3202,7 +3286,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
    };
    Qnn_Tensor_t out_trans1_0_inputs[]  = {*p_tensor2_transpose};
    Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2};
-#if 0
+#if 0 // kept here to make the code easier to understand; can be removed in the future
    Qnn_OpConfig_t out_trans1_0 = {
        QNN_OPCONFIG_VERSION_1,
        .v1 = {"ggmlqnn_mulmat_transpose_opconfig",
@@ -3216,7 +3300,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
    };
#else
    Qnn_OpConfig_t out_trans1_0 = create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE,
-                                                  out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1);
+                                                   out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1);
#endif
    CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_trans1_0));

@@ -3225,9 +3309,9 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
    Qnn_Tensor_t input_tensors_0[]  = {*p_tensor0, *p_tensor1};
    Qnn_Tensor_t output_tensors_0[] = {*p_tensor2};
    CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
-                                                       input_tensors_0, 2,
-                                                       output_tensors_0, 1,
-                                                       nullptr, nullptr));
+                                                        input_tensors_0, 2,
+                                                        output_tensors_0, 1,
+                                                        nullptr, nullptr));

    qnn_tensors_t ggml_op_mulmat_tensors;
    ggml_op_mulmat_tensors.reserve(5);
@@ -3239,7 +3323,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
        auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors);
        instance->_qnn_graph_map[graph_name] = graph_item;
    } else {
-       QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+       if (type != GGML_TYPE_F32) {
+           QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast<uint32_t>(desired_size)};
+       } else {
+           QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+       }
        QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
        QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

@@ -3250,13 +3338,13 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
        Qnn_Tensor_t tensor_outputs[] = {
            *p_tensor2
        };
-       // this is the second technical approach of "how to utilize the Hexagon NPU maximally" through
-       // QNN SDK, details could be found at
-       // https://github.com/kantv-ai/llama.cpp/wiki/mapping-ggml-compute-graph-to-QNN-compute-graph
+       // this is the second technical approach, or another pipeline, of "how to utilize the Hexagon
+       // NPU maximally" through the QNN SDK; details can be found at
+       // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360
        CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
-                                                           tensor_inputs, 2,
-                                                           tensor_outputs, 1,
-                                                           nullptr, nullptr));
+                                                            tensor_inputs, 2,
+                                                            tensor_outputs, 1,
+                                                            nullptr, nullptr));
    }

    // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor