@@ -104,6 +104,12 @@ struct ggml_backend_qnn_context;
 static int free_qnn_tensor(Qnn_Tensor_t * tensor);
 static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...);
+static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name,
+                                                     Qnn_TensorType_t qnn_tensor_type,
+                                                     Qnn_DataType_t qnn_data_type,
+                                                     uint32_t rank, uint32_t * dims,
+                                                     void * data, uint32_t data_size,
+                                                     bool b_transpose = false);
 
 // =================================================================================================
 // section-2: ggml-qnn internal troubleshooting function
@@ -163,6 +169,7 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const
 
 #define GGMLQNN_MEM_ADD(alignment)  (sizeof(size_t) + alignment)
 #define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1)
+#define GQCGT ggml_qnn_create_general_tensor
 
 static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) {
     return offset % alignment == 0 ? offset
@@ -1013,6 +1020,20 @@ static const char * qnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) {
     }
 }
 
+// helper function to create an operation config
+static Qnn_OpConfig_t create_op_config(const char * name, const char * package, const char * type,
+                                       Qnn_Param_t * params, uint32_t num_params,
+                                       Qnn_Tensor_t * inputs, uint32_t num_inputs,
+                                       Qnn_Tensor_t * outputs, uint32_t num_outputs) {
+    Qnn_OpConfigV1_t v1 = {name, package, type,
+                           num_params, params,
+                           num_inputs, inputs,
+                           num_outputs, outputs
+    };
+
+    return (Qnn_OpConfig_t){QNN_OPCONFIG_VERSION_1, .v1 = v1};
+}
+
 // =================================================================================================
 // section-5: ggml-qnn backend helper macro / data structure / function / class
 // =================================================================================================
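For illustration only (not part of this patch): a minimal sketch of how a caller could compose a single-node op config with the new create_op_config helper. The tensor handles, graph_handle and error variable are assumed to have been created elsewhere (for example via GQCGT plus the usual QNN graph setup); QNN_OP_ELEMENT_WISE_ADD and QNN_OP_PACKAGE_NAME_QTI_AISW are standard QNN SDK op/package names.

// illustrative sketch (assumption): add an element-wise add node to an existing graph
Qnn_Tensor_t add_inputs[]  = {*p_tensor_a, *p_tensor_b};   // hypothetical tensors created via GQCGT
Qnn_Tensor_t add_outputs[] = {*p_tensor_out};
Qnn_OpConfig_t add_op = create_op_config("ggmlqnn_add_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW,
                                         QNN_OP_ELEMENT_WISE_ADD,
                                         nullptr, 0,        // no op-specific params for a plain add
                                         add_inputs, 2,
                                         add_outputs, 1);
CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, add_op));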
@@ -1469,10 +1490,32 @@ static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) {
     return nullptr;
 }
 
-static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type,
-                                                     Qnn_DataType_t qnn_data_type, uint32_t rank, uint32_t * dims, void * data, uint32_t data_size) {
-    Qnn_ErrorHandle_t error = QNN_SUCCESS;
-    char tensor_name[GGML_MAX_NAME] = {0};
+static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) {
+    if (rank > GGML_MAX_DIMS) {
+        GGMLQNN_LOG_WARN("invalid params");
+        return;
+    }
+    if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) {
+        GGMLQNN_LOG_WARN("invalid params");
+        return;
+    }
+    for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++)
+        qnn_dimensions[idx] = ggml_dimensions[idx];
+
+    if (rank >= 2) {
+        qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2];
+        qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1];
+    }
+}
+
+static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name,
+                                                     Qnn_TensorType_t qnn_tensor_type,
+                                                     Qnn_DataType_t qnn_data_type,
+                                                     uint32_t rank, uint32_t * dims,
+                                                     void * data, uint32_t data_size,
+                                                     bool b_transpose) {
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    char tensor_name[GGML_MAX_NAME] = {};
 
     // ensure the tensor name is unique
     if (nullptr != name) {
@@ -1483,19 +1526,36 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor,
     GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx());
     inc_idx();
 
-    uint32_t dimensions_transpose[GGML_MAX_DIMS] = {};
-    uint32_t * tensor_dims = nullptr;
+    uint32_t reverse_dims[GGML_MAX_DIMS]   = {};
+    uint32_t transpose_dims[GGML_MAX_DIMS] = {};
+    uint32_t * tensor_dims                 = nullptr;
+    // case 1: use dims info from ggml tensor
     if (nullptr != tensor) {
         // there are different dimension order between ggml tensor and qnn tensor
         for (size_t idx = 0; idx < rank; idx++) {
-            dimensions_transpose[idx] = (uint32_t)tensor->ne[rank - 1 - idx];
+            reverse_dims[idx] = (uint32_t)tensor->ne[rank - 1 - idx];
         }
-        tensor_dims = dimensions_transpose;
+        tensor_dims = reverse_dims;
     }
-    // re-assign tensor_dims
+    // case 2: use user's specified tensor_dims
     if (nullptr != dims) {
         tensor_dims = dims;
     }
+    // case 3: transpose for dst tensor
+    if (b_transpose) {
+        GGML_ASSERT(tensor != nullptr); // ensure ggml_tensor is not nullptr for this special case
+
+        get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_get_tensor_rank(tensor));
+        tensor_dims = transpose_dims;
+#if 0
+        for (size_t idx = 0; idx < 4; idx++) {
+            GGMLQNN_LOG_DEBUG("origin dim[%d]=%d\n", idx, reverse_dims[idx]);
+        }
+        for (size_t idx = 0; idx < 4; idx++) {
+            GGMLQNN_LOG_DEBUG("trans dim[%d]=%d\n", idx, transpose_dims[idx]);
+        }
+#endif
+    }
 
     Qnn_Tensor_t qnn_tensor = {
             .version = QNN_TENSOR_VERSION_1,
@@ -2989,25 +3049,6 @@ static void dump_op_info(const struct ggml_tensor * tensor) {
     print_tensors_info(nullptr, nullptr, src0, src1, dst);
 }
 
-// TODO: currently only support offloading 2D matrix to QNN backend
-static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, uint32_t * ggml_dimensions, uint32_t rank) {
-    if (rank > GGML_MAX_DIMS) {
-        GGMLQNN_LOG_WARN("invalid params");
-        return;
-    }
-    if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) {
-        GGMLQNN_LOG_WARN("invalid params");
-        return;
-    }
-    for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++)
-        qnn_dimensions[idx] = ggml_dimensions[idx];
-
-    if (rank >= 2) {
-        qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2];
-        qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1];
-    }
-}
-
 // =================================================================================================
 // section-6: implementation of ggml-qnn backend
 // =================================================================================================
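For illustration only (not part of this patch): a minimal sketch of the dimension handling that GQCGT and the relocated get_qnn_dimensions_from_ggml_dimensions perform for a rank-2 dst tensor. The concrete shape values are hypothetical; the behavior follows the code shown in the hunks above.

// a ggml tensor with ne = {4, 3} (4 columns, 3 rows) is first reversed into QNN dimension order:
// reverse_dims[idx] = ne[rank - 1 - idx]  ->  {3, 4}
uint32_t reverse_dims[GGML_MAX_DIMS]   = {3, 4, 0, 0};
uint32_t transpose_dims[GGML_MAX_DIMS] = {};
// with b_transpose, the last two entries are swapped again for the MatMul output tensor
get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, 2);
// transpose_dims is now {4, 3, 0, 0}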
@@ -3056,10 +3097,9 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
     }
 
     if (tensor->op == GGML_OP_MUL_MAT) {
-        dump_op_info(tensor);
         if (src0_rank != src1_rank) // make QNN SDK happy
             return false;
-        if (src0_rank < 2) // make QNN SDK happy
+        if (src0_rank < 2) // QNN's limitation, make QNN SDK happy
             return false;
         if (src0_rank > 3) // TODO: 4D matrix
             return false;
@@ -3327,7 +3367,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
     bool graph_initialized = false;
     qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat");
     qnn_instance * instance = nullptr;
-    ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context;
+    ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context;
     Qnn_GraphHandle_t graph_handle = nullptr;
     Qnn_Tensor_t * p_tensor0 = nullptr;
     Qnn_Tensor_t * p_tensor1 = nullptr;
@@ -3361,11 +3401,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         p_param_tensor = tensors[3];
         p_tensor2_transpose = tensors[4];
     } else {
-        p_tensor0 = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
-        p_tensor1 = ggml_qnn_create_general_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
-        p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
+        p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
+        p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
+        p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
     }
-
     print_tensors_info(__func__, ctx, src0, src1, dst);
 
     // ensure QNN tensor has correct tensor type
@@ -3418,9 +3457,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
             {0, 1, 3, 2},
     };
     uint32_t param_tensor_dims[1] = {src0_rank};
-    p_param_tensor = ggml_qnn_create_general_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32,
-                                                    1, param_tensor_dims,
-                                                    (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t));
+    p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t));
     CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor));
 
     // step-3: create compute tensor from ggml tensor
@@ -3433,13 +3470,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
     QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};
 
     // step-4: create a transpose tensor
-    uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {};
-    p_tensor2_transpose = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
-    get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions, ggml_get_tensor_rank(dst));
-    // save pointer because the dimensions of tensor p_tensor2_transpose will be changed later
-    uint32_t * tensor2_dimensions_transpose = QNN_VER_PTR(*p_tensor2_transpose)->dimensions;
-    // update dimensions of tensor p_tensor2_transpose to make QNN SDK happy
-    QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_transpose_dims;
+    p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true);
     CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose));
 
     // step-5: compose qnn graph: add mat_mul node
@@ -3452,6 +3483,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
 
     Qnn_Tensor_t out_0_inputs[]  = {*p_tensor0, *p_tensor1};
     Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose};
+#if 0
     Qnn_OpConfig_t out_0 = {
             QNN_OPCONFIG_VERSION_1, .v1 =
             {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
@@ -3462,6 +3494,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
             1,
             out_0_outputs}
     };
+#else
+    Qnn_OpConfig_t out_0 = create_op_config("ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
+                                            out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1);
+#endif
     CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_0));
 
     // step-5: compose qnn graph: add transpose node
@@ -3472,17 +3508,22 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
     };
     Qnn_Tensor_t out_trans1_0_inputs[]  = {*p_tensor2_transpose};
     Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2};
+#if 0
     Qnn_OpConfig_t out_trans1_0 = {
             QNN_OPCONFIG_VERSION_1,
             .v1 = {"ggmlqnn_mulmat_transpose_opconfig",
-                   "qti.aisw",
+                   QNN_OP_PACKAGE_NAME_QTI_AISW,
                    QNN_OP_TRANSPOSE, 1,
                    out_trans1_0_params,
                    1,
                    out_trans1_0_inputs,
                    1,
                    out_trans1_0_outputs}
     };
+#else
+    Qnn_OpConfig_t out_trans1_0 = create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE,
+                                                   out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1);
+#endif
     CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_trans1_0));
 
     // step-6: finalize qnn graph and execute qnn graph
@@ -3501,15 +3542,8 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         ggml_op_mulmat_tensors.push_back(p_tensor2);
         ggml_op_mulmat_tensors.push_back(p_param_tensor);
         ggml_op_mulmat_tensors.push_back(p_tensor2_transpose);
-
         auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors);
         instance->_qnn_graph_map[graph_name] = graph_item;
-
-        // avoid cleanup these resource to make test_backend_ops happy
-        // free_qnn_tensor(p_param_tensor);
-        // restore pointer to avoid memory leak
-        QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_dimensions_transpose;
-        // free_qnn_tensor(p_tensor2_transpose);
     } else {
         QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
         QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
@@ -3522,7 +3556,6 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         Qnn_Tensor_t tensor_outputs[] = {
                 *p_tensor2
         };
-        // attention:
         // this is the second technical approach of "how to utilize the Hexagon NPU maximally" through
         // QNN SDK, details could be found at
         // https://github.com/kantv-ai/llama.cpp/wiki/mapping-ggml-compute-graph-to-QNN-compute-graph
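For illustration only (not part of this patch): a condensed sketch of the graph-composition pattern the diff converges on, combining the GQCGT shorthand with create_op_config. graph_handle, error, src0/src1/dst and src0_rank are assumed to exist as in ggml_qnn_mul_mat above; the p_in0/p_in1/p_out names are hypothetical, and the transpose handling required for the real MatMul output is omitted here for brevity.

// create QNN tensors from ggml tensors, register them on the graph, then add one MatMul node
Qnn_Tensor_t * p_in0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
Qnn_Tensor_t * p_in1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
Qnn_Tensor_t * p_out = GQCGT(dst,  nullptr, QNN_TENSOR_TYPE_APP_READ,  QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_in0));
CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_in1));
CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_out));

Qnn_Tensor_t mm_inputs[]  = {*p_in0, *p_in1};
Qnn_Tensor_t mm_outputs[] = {*p_out};
Qnn_OpConfig_t mm_op = create_op_config("ggmlqnn_example_matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
                                        nullptr, 0, mm_inputs, 2, mm_outputs, 1);
CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, mm_op));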