
Commit d5d110d

ggml-qnn: refine function ggml_qnn_create_general_tensor() to avoid complex/redundant pointer operations
1 parent 60ca941 commit d5d110d

1 file changed

ggml/src/ggml-qnn/ggml-qnn.cpp

Lines changed: 87 additions & 54 deletions
@@ -104,6 +104,12 @@ struct ggml_backend_qnn_context;
 static int free_qnn_tensor(Qnn_Tensor_t * tensor);
 static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...);
+static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name,
+                                                     Qnn_TensorType_t qnn_tensor_type,
+                                                     Qnn_DataType_t qnn_data_type,
+                                                     uint32_t rank, uint32_t * dims,
+                                                     void * data, uint32_t data_size,
+                                                     bool b_transpose = false);

 // =================================================================================================
 // section-2: ggml-qnn internal troubleshooting function
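
Note: `b_transpose` defaults to `false`, so every pre-existing call site compiles unchanged; only the dst-tensor path in `ggml_qnn_mul_mat` opts in. A minimal sketch of the two calling styles, mirroring the call sites later in this diff (variable names hypothetical):

    // dims derived from the ggml tensor, no transpose (existing style)
    Qnn_Tensor_t * p_src = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,
                                                          QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
    // dst tensor created with transposed dimensions (new style)
    Qnn_Tensor_t * p_dst = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE,
                                                          QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true);
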
@@ -163,6 +169,7 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const

 #define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment)
 #define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1)
+#define GQCGT ggml_qnn_create_general_tensor

 static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) {
     return offset % alignment == 0 ? offset
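
As a quick check of the alignment helper above: `ggmlqnn_align_to` returns `offset` unchanged when it is already a multiple of `alignment`, and otherwise rounds it up (the second branch, truncated by this hunk, is presumably `offset + (alignment - offset % alignment)`); e.g. `ggmlqnn_align_to(32, 96)` yields 96, while `ggmlqnn_align_to(32, 100)` yields 128.
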
@@ -1013,6 +1020,20 @@ static const char * qnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) {
     }
 }

+// helper function to create an operation config
+static Qnn_OpConfig_t create_op_config(const char * name, const char * package, const char * type,
+                                       Qnn_Param_t * params, uint32_t num_params,
+                                       Qnn_Tensor_t * inputs, uint32_t num_inputs,
+                                       Qnn_Tensor_t * outputs, uint32_t num_outputs) {
+    Qnn_OpConfigV1_t v1 = {name, package, type,
+                           num_params, params,
+                           num_inputs, inputs,
+                           num_outputs, outputs
+                          };
+
+    return (Qnn_OpConfig_t){QNN_OPCONFIG_VERSION_1, .v1 = v1};
+}
+
 // =================================================================================================
 // section-5:ggml-qnn backend helper macro / data structure / function / class
 // =================================================================================================
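
With this helper, the verbose designated-initializer blocks later in this diff collapse into one call each. A minimal usage sketch (array contents elided, counts hypothetical):

    Qnn_Param_t    params[1]  = {/* op parameter(s) */};
    Qnn_Tensor_t   inputs[2]  = {/* input tensor(s) */};
    Qnn_Tensor_t   outputs[1] = {/* output tensor(s) */};
    Qnn_OpConfig_t op_config  = create_op_config("my_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
                                                 params, 1, inputs, 2, outputs, 1);
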
@@ -1469,10 +1490,32 @@ static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) {
     return nullptr;
 }

-static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type,
-                                                     Qnn_DataType_t qnn_data_type, uint32_t rank, uint32_t * dims, void * data, uint32_t data_size) {
-    Qnn_ErrorHandle_t error = QNN_SUCCESS;
-    char tensor_name[GGML_MAX_NAME] = {0};
+static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) {
+    if (rank > GGML_MAX_DIMS) {
+        GGMLQNN_LOG_WARN("invalid params");
+        return;
+    }
+    if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) {
+        GGMLQNN_LOG_WARN("invalid params");
+        return;
+    }
+    for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++)
+        qnn_dimensions[idx] = ggml_dimensions[idx];
+
+    if (rank >= 2) {
+        qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2];
+        qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1];
+    }
+}
+
+static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name,
+                                                     Qnn_TensorType_t qnn_tensor_type,
+                                                     Qnn_DataType_t qnn_data_type,
+                                                     uint32_t rank, uint32_t * dims,
+                                                     void * data, uint32_t data_size,
+                                                     bool b_transpose) {
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    char tensor_name[GGML_MAX_NAME] = {};

     //ensure the tensor name is unique
     if (nullptr != name) {
@@ -1483,19 +1526,36 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor,
     GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx());
     inc_idx();

-    uint32_t dimensions_transpose[GGML_MAX_DIMS] = {};
-    uint32_t * tensor_dims = nullptr;
+    uint32_t reverse_dims[GGML_MAX_DIMS] = {};
+    uint32_t transpose_dims[GGML_MAX_DIMS] = {};
+    uint32_t * tensor_dims = nullptr;
+    //case 1:use dims info from ggml tensor
     if (nullptr != tensor) {
         //there are different dimension order between ggml tensor and qnn tensor
         for (size_t idx = 0; idx < rank; idx++) {
-            dimensions_transpose[idx] = (uint32_t)tensor->ne[rank - 1 - idx];
+            reverse_dims[idx] = (uint32_t)tensor->ne[rank - 1 - idx];
         }
-        tensor_dims = dimensions_transpose;
+        tensor_dims = reverse_dims;
     }
-    //re-assign tensor_dims
+    //case 2: use user's specified tensor_dims
     if (nullptr != dims) {
         tensor_dims = dims;
     }
+    //case 3: transpose for dst tensor
+    if (b_transpose) {
+        GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case
+
+        get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_get_tensor_rank(tensor));
+        tensor_dims = transpose_dims;
+#if 0
+        for (size_t idx = 0; idx < 4; idx++) {
+            GGMLQNN_LOG_DEBUG("origin dim[%d]=%d\n", idx, reverse_dims[idx]);
+        }
+        for (size_t idx = 0; idx < 4; idx++) {
+            GGMLQNN_LOG_DEBUG("trans dim[%d]=%d\n", idx, transpose_dims[idx]);
+        }
+#endif
+    }

     Qnn_Tensor_t qnn_tensor = {
         .version= QNN_TENSOR_VERSION_1,
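
A concrete rank-2 walk-through of the three cases: for a dst tensor with ggml `ne = {4, 6}`, case 1 produces `reverse_dims = {6, 4}` (ggml stores the innermost dimension first, QNN the outermost); with `b_transpose` set, case 3 swaps the last two entries of that result, so `transpose_dims = {4, 6}` becomes the shape of the intermediate (pre-transpose) MatMul output tensor created in `ggml_qnn_mul_mat` below.
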
@@ -2989,25 +3049,6 @@ static void dump_op_info(const struct ggml_tensor * tensor) {
     print_tensors_info(nullptr, nullptr, src0, src1, dst);
 }

-//TODO: currently only support offloading 2D matrix to QNN backend
-static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, uint32_t * ggml_dimensions, uint32_t rank) {
-    if (rank > GGML_MAX_DIMS) {
-        GGMLQNN_LOG_WARN("invalid params");
-        return;
-    }
-    if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) {
-        GGMLQNN_LOG_WARN("invalid params");
-        return;
-    }
-    for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++)
-        qnn_dimensions[idx] = ggml_dimensions[idx];
-
-    if (rank >= 2) {
-        qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2];
-        qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1];
-    }
-}
-
 // =================================================================================================
 // section-6: implementation of ggml-qnn backend
 // =================================================================================================
@@ -3056,10 +3097,9 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
     }

     if (tensor->op == GGML_OP_MUL_MAT) {
-        dump_op_info(tensor);
         if (src0_rank != src1_rank) // make QNN SDK happy
             return false;
-        if (src0_rank < 2) // make QNN SDK happy
+        if (src0_rank < 2) // QNN's limitation, make QNN SDK happy
             return false;
         if (src0_rank > 3) //TODO: 4D matrix
             return false;
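
In short, only GGML_OP_MUL_MAT nodes whose sources have equal rank of 2 or 3 are accepted here; anything else is rejected so the op is left to ggml's fallback path (typically the CPU backend).
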
@@ -3327,7 +3367,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
     bool graph_initialized = false;
     qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat");
     qnn_instance * instance = nullptr;
-    ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context;
+    ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context;
     Qnn_GraphHandle_t graph_handle = nullptr;
     Qnn_Tensor_t * p_tensor0 = nullptr;
     Qnn_Tensor_t * p_tensor1 = nullptr;
@@ -3361,11 +3401,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         p_param_tensor = tensors[3];
         p_tensor2_transpose = tensors[4];
     } else {
-        p_tensor0 = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
-        p_tensor1 = ggml_qnn_create_general_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
-        p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
+        p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
+        p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
+        p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
     }
-
     print_tensors_info(__func__, ctx, src0, src1, dst);

     //ensure QNN tensor has correct tensor type
@@ -3418,9 +3457,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
             {0, 1, 3, 2},
         };
         uint32_t param_tensor_dims[1] = {src0_rank};
-        p_param_tensor = ggml_qnn_create_general_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32,
-                                                        1, param_tensor_dims,
-                                                        (void *) (param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t));
+        p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t));
         CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor));

         //step-3: create compute tensor from ggml tensor
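
`param_tensor_data[src0_rank - 1]` selects the permutation row matching the tensor rank; the `{0, 1, 3, 2}` row visible above is the rank-4 entry and swaps the two innermost axes. The lower-rank rows are not shown in this hunk but presumably follow the same pattern (e.g. `{1, 0}` for rank 2), matching the axis swap performed by `get_qnn_dimensions_from_ggml_dimensions`.
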
@@ -3433,13 +3470,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

         //step-4: create a transpose tensor
-        uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {};
-        p_tensor2_transpose = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
-        get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions, ggml_get_tensor_rank(dst));
-        //save pointer because the dimensions of tensor p_tensor2_transpose will be changed later
-        uint32_t * tensor2_dimensions_transpose = QNN_VER_PTR(*p_tensor2_transpose)->dimensions;
-        //update dimensions of tensor p_tensor2_transpose to make QNN SDK happy
-        QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_transpose_dims;
+        p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true);
         CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose));

         //step-5: compose qnn graph: add mat_mul node
@@ -3452,6 +3483,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {

         Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1};
         Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose};
+#if 0
         Qnn_OpConfig_t out_0 = {
             QNN_OPCONFIG_VERSION_1, .v1 =
                 {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
@@ -3462,6 +3494,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
                  1,
                  out_0_outputs}
         };
+#else
+        Qnn_OpConfig_t out_0 = create_op_config("ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
+                                                out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1);
+#endif
         CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0));

         //step-5: compose qnn graph: add transpose node
@@ -3472,17 +3508,22 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         };
         Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose};
         Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2};
+#if 0
         Qnn_OpConfig_t out_trans1_0 = {
             QNN_OPCONFIG_VERSION_1,
             .v1 = {"ggmlqnn_mulmat_transpose_opconfig",
-                   "qti.aisw",
+                   QNN_OP_PACKAGE_NAME_QTI_AISW,
                    QNN_OP_TRANSPOSE, 1,
                    out_trans1_0_params,
                    1,
                    out_trans1_0_inputs,
                    1,
                    out_trans1_0_outputs}
         };
+#else
+        Qnn_OpConfig_t out_trans1_0 = create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE,
+                                                       out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1);
+#endif
         CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0));

         //step-6: finalize qnn graph and execute qnn graph
@@ -3501,15 +3542,8 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         ggml_op_mulmat_tensors.push_back(p_tensor2);
         ggml_op_mulmat_tensors.push_back(p_param_tensor);
         ggml_op_mulmat_tensors.push_back(p_tensor2_transpose);
-
         auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors);
         instance->_qnn_graph_map[graph_name] = graph_item;
-
-        //avoid cleanup these resource to make test_backend_ops happy
-        //free_qnn_tensor(p_param_tensor);
-        //restore pointer to avoid memory leak
-        QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_dimensions_transpose;
-        //free_qnn_tensor(p_tensor2_transpose);
     } else {
         QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
         QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
@@ -3522,7 +3556,6 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
         Qnn_Tensor_t tensor_outputs[] = {
             *p_tensor2
         };
-        //attention:
         // this is the second technical approach of "how to utilize the Hexagon NPU maximally" through
         // QNN SDK, details could be found at
         // https://github.com/kantv-ai/llama.cpp/wiki/mapping-ggml-compute-graph-to-QNN-compute-graph
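
For reference, the cached path at the top of this function recovers these five tensors from `_qnn_graph_map` in the same order they are pushed above. A sketch of that lookup, consistent with the `tensors[3]`/`tensors[4]` lines in an earlier hunk (exact container types assumed):

    // graph cache hit: reuse the finalized QNN graph and its tensors
    auto & graph_item   = instance->_qnn_graph_map[graph_name];
    graph_handle        = std::get<0>(graph_item);
    auto & tensors      = std::get<1>(graph_item);
    p_tensor0           = tensors[0];
    p_tensor1           = tensors[1];
    p_tensor2           = tensors[2];
    p_param_tensor      = tensors[3];
    p_tensor2_transpose = tensors[4];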
