
Commit a7637d3

noemotiovon authored and committed
CANN: RoPE and CANCAT operator optimization (ggml-org#10488)
Co-authored-by: noemotiovon <noemotiovon@gmail.com>
1 parent dc2dc79 commit a7637d3

File tree

2 files changed: +106 -187 lines


ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 47 additions & 178 deletions
@@ -21,6 +21,7 @@
 */

 #include "aclnn_ops.h"
+#include "ggml-impl.h"

 #include <aclnnop/aclnn_avgpool2d.h>
 #include <aclnnop/aclnn_cast.h>
@@ -241,10 +242,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);

-    int64_t concat_dim = 1;
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+    int32_t acl_dim = 3 - dim;
+
     aclTensor* tensors[] = {acl_src0, acl_src1};
     aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
-    aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
+    aclnn_concat(ctx, tensorList, acl_dst, acl_dim);

     ACL_CHECK(aclDestroyTensorList(tensorList));
     ACL_CHECK(aclDestroyTensor(acl_dst));
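
The concat axis now comes from the op's parameters instead of being hard-coded to 1. The `3 - dim` flip is the usual ggml-to-ACL axis mapping: ggml counts dimensions from the innermost (ne[0] varies fastest), while the 4-D ACL tensor descriptor counts from the outermost, so the index order reverses. A minimal sketch of that mapping (the helper name is hypothetical, not part of the patch):

#include <cassert>
#include <cstdint>

// Hypothetical helper showing the axis flip used above: a ggml dimension
// index (innermost first) becomes its mirror image in the 4-D ACL layout.
static int32_t ggml_dim_to_acl_dim(int32_t dim) {
    assert(dim >= 0 && dim < 4);  // same guard as the GGML_ASSERT in the patch
    return 3 - dim;
}
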
@@ -1437,10 +1442,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0]; // kernel
     ggml_tensor* src1 = dst->src[1]; // input

-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
     GGML_TENSOR_BINARY_OP_LOCALS;

     // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
@@ -1462,9 +1463,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;

-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
     // memory allocated increased to 3x when is_2D == false
     const int64_t n_bytes_factor = is_2D ? 1 : 3;

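The type assertions dropped in the previous hunk and the stride checks dropped here had pinned ggml_cann_im2col to an F16 kernel with densely packed F32 input; with them relaxed, GGML_OP_IM2COL correspondingly moves into the backend's unconditional allow-list in ggml-cann.cpp further down.
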
@@ -2859,15 +2857,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
 }

+#ifdef __cplusplus
+extern "C" {
+#endif
+aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+    const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
+    int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
+    aclOpExecutor** executor);
+aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
+                                         uint64_t workspaceSize,
+                                         aclOpExecutor* executor,
+                                         aclrtStream stream);
+#ifdef __cplusplus
+}
+#endif
+
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // TODO: use ascendc
     // Only test with LLAMA model.
     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src2 = dst->src[2];  // freq_factors

-    // TODO: with freq_factors
-    GGML_ASSERT(src2 == NULL);
-
     // param
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
     // const int n_past = ((int32_t *) dst->op_params)[0];
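
The two prototypes above are forward-declared with C linkage rather than pulled in from an <aclnnop/...> header, presumably so the file still builds against CANN toolkits that do not ship a public header for this operator; the extern "C" guard keeps the symbol names unmangled so they resolve against the ACL operator library at link time.
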
@@ -2885,13 +2895,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));

-    GGML_ASSERT(n_dims <= ne0);
+    // TODO: with freq_factors
+    GGML_ASSERT(src2 == NULL);
+    // TODO: attn_factor != 1
+    GGML_ASSERT(attn_factor == 1);
+    // TODO: n_dims <= ne0
+    GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
-
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
     // TODO: freq_scale != 1
     GGML_ASSERT(freq_scale == 1);
+    // TODO: type == GGML_TYPE_F16
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);

     const float theta_scale = powf(freq_base, -2.0f / n_dims);

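For context on theta_scale: RoPE gives channel pair i the base frequency theta_i = freq_base^(-2i/n_dims), so consecutive pairs differ by the constant ratio theta_scale = freq_base^(-2/n_dims). A small illustrative sketch (not the patch's code) of expanding that ratio into the cached per-pair frequencies:

#include <cmath>
#include <vector>

// Illustrative only: expand theta_scale into the RoPE frequencies
// theta_i = freq_base^(-2*i / n_dims) for the n_dims/2 rotation pairs.
static std::vector<float> rope_thetas(int n_dims, float freq_base) {
    const float theta_scale = powf(freq_base, -2.0f / n_dims);
    std::vector<float> theta(n_dims / 2);
    float cur = 1.0f;                 // theta_0 = freq_base^0 = 1
    for (int i = 0; i < n_dims / 2; i++) {
        theta[i] = cur;
        cur *= theta_scale;           // theta_{i+1} = theta_i * theta_scale
    }
    return theta;
}
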
@@ -2924,177 +2940,30 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
                      theta_scale, is_neox);

-    // roll input
-    void* input_roll_buffer;
-    aclTensor* acl_minus_one_tensor;
-    void* minus_one_scale_buffer = nullptr;
-    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
-    ggml_cann_pool_alloc minus_one_scale_allocator(
-        ctx.pool(), sizeof(float_t) * src0->ne[0]);
-    if (!is_neox) {
-        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
-        input_roll_buffer = roll_allocator.get();
-        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
-                                    src0->ne[2], src0->ne[3]};
-        size_t input_roll_nb[GGML_MAX_DIMS];
-        input_roll_nb[0] = ggml_type_size(src0->type);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
-        }
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            src0->data, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-
-        int64_t shifts[] = {1};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-
-        // init [-1, 1, -1, 1, ...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        int64_t dim = 3;
-        int64_t* index = new int64_t[src0->ne[0]];
-        for (int i = 0; i < src0->ne[0]; i++) {
-            index[i] = i / 2 * 2;
-        }
-        int64_t index_num = src0->ne[0];
-        float value = -1;
-        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
-                                index_num, value);
-    } else {
-        // roll input: [q0,q1,q2,...] ->
-        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
-        input_roll_buffer = roll_allocator.get();
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
-
-        int64_t shifts[] = {src0->ne[0] / 2};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;

-        // init [-1, -1, -1, 1, 1,1,...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
+    void* workspaceAddr = nullptr;

-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        // -1 * first half
-        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
-        size_t first_half_nb[GGML_MAX_DIMS];
-        first_half_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
-        }
-        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
-            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
-            first_half_nb, GGML_MAX_DIMS);
-        bool inplace = true;
-        float scale = -1;
-        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
-        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
-    }
-
-    // TODO: n_dims < ne0
-    GGML_ASSERT(n_dims == src0->ne[0]);
-
-    // input * scale
-    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
-                                                  ggml_nbytes(src0));
-    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
-    size_t input_nb[GGML_MAX_DIMS];
-    input_nb[0] = ggml_type_size(src0->type);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
+    int acl_mode = mode;
+    if (mode == 0) {
+        acl_mode = 1;
     }
-    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
-        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
-        input_roll_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);

-    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
-              acl_input_roll_mul_scale_tensor);
-
-    // output
-    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+    aclTensor* acl_x = ggml_cann_create_tensor(src0);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-    void* output_fp32_buffer;
-    if (src0->type == GGML_TYPE_F32) {
-        aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor);
-        aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
-                          acl_sin_reshape_tensor);
-        aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
-        // TODO: ne0 != n_dims in mode2
-    } else if (src0->type == GGML_TYPE_F16) {
-        size_t input_fp32_nb[GGML_MAX_DIMS];
-        input_fp32_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
-        }
-        ggml_cann_pool_alloc fp32_allocator1(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer1 = fp32_allocator1.get();
-        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
-            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        ggml_cann_pool_alloc fp32_allocator2(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer2 = fp32_allocator2.get();
-        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
-            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-
-        ggml_cann_pool_alloc fp32_allocator(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        output_fp32_buffer = fp32_allocator.get();
-        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
-            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
-        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
-                  input_fp32_tensor2);
-        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
-                  output_fp32_tensor);
-        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
-
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
-        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
+        acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
     }

-    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
+                                           executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_x));
     ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
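
The hand-rolled roll/scale/multiply-add pipeline is gone: the fused ACL operator performs the rotation in a single launch, invoked through aclnn's standard two-phase convention (query the workspace size, then execute on the context's stream). A condensed sketch of that pattern under the patch's assumptions (the tensors were already created with ggml_cann_create_tensor and acl_mode derived from the ggml RoPE mode); unlike the patch, it scopes the pool allocation so the buffer is not returned to the pool before the asynchronous launch:

// Sketch of the two-phase aclnn call pattern used by the new ggml_cann_rope.
uint64_t workspaceSize = 0;
aclOpExecutor* executor = nullptr;
void* workspaceAddr = nullptr;

// Phase 1: plan the operator and query its scratch-memory requirement.
ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
    acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
    acl_dst, &workspaceSize, &executor));

// Phase 2: allocate the workspace from the backend pool, then enqueue.
// Declaring the allocator outside the if-block keeps the buffer alive
// until the kernel has been submitted to the stream.
ggml_cann_pool_alloc workspace_allocator(ctx.pool());
if (workspaceSize > 0) {
    workspaceAddr = workspace_allocator.alloc(workspaceSize);
}
ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
                                       executor, ctx.stream()));
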

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 59 additions & 9 deletions
@@ -1669,12 +1669,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         }
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
-                case GGML_TYPE_F16:
-                case GGML_TYPE_F32:
                 case GGML_TYPE_Q8_0:
-                    // TODO: fix me
                     // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
+                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
+                    if (op->src[0]->ne[0] <= QK8_0) {
+                        return false;
+                    }
+                case GGML_TYPE_F16:
+                case GGML_TYPE_F32:
                 case GGML_TYPE_Q4_0:
                     return true;
                 default:
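
Note the deliberate fallthrough: for Q8_0 the new guard rejects matrices whose reduction dimension ne[0] is not larger than QK8_0 (the 32-element Q8_0 block size), since aclnnWeightQuantBatchMatmulV2GetWorkspaceSize requires the quantization group size to be at most k-1; anything passing the guard falls through to the `return true` shared with F16, F32, and Q4_0.
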
@@ -1706,9 +1708,61 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                     return false;
             }
         }
+        case GGML_OP_CONT: {
+            // TODO: support GGML_TYPE_BF16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_ROPE: {
+            // TODO: with ops-test v == 1
+            float * freq_scale  = (float*)((int32_t*)op->op_params + 6);
+            float * ext_factor  = (float*)((int32_t*)op->op_params + 7);
+            float * attn_factor = (float*)((int32_t*)op->op_params + 8);
+            // TODO: with freq_factors
+            if (op->src[2] != NULL) {
+                return false;
+            }
+            // TODO: n_dims <= ne0
+            if (op->src[0]->ne[0] != op->op_params[1]) {
+                return false;
+            }
+            // TODO: ext_factor != 0
+            if (*ext_factor != 0) {
+                return false;
+            }
+            // TODO: freq_scale != 1
+            if (*freq_scale != 1) {
+                return false;
+            }
+            // TODO: attn_factor != 1
+            if (*attn_factor != 1) {
+                return false;
+            }
+            // TODO: type == GGML_TYPE_F16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_UPSCALE: {
+            // aclnnUpsampleNearest2dGetWorkspaceSize not support
+            // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
+            if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
+                return false;
+            }
+            return true;
+        }
+        case GGML_OP_IM2COL:
+        case GGML_OP_CONCAT:
         case GGML_OP_DUP:
         case GGML_OP_REPEAT:
-        case GGML_OP_CONCAT:
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
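
The GGML_OP_ROPE checks mirror the GGML_ASSERTs in ggml_cann_rope, so unsupported configurations are refused at scheduling time instead of aborting mid-graph. The float hyper-parameters are read here by casting into op_params; elsewhere ggml usually copies them out with memcpy, which avoids aliasing a float through an int32_t pointer. A sketch of that equivalent, more defensive read (the helper is hypothetical; the slot numbers match the offsets used above):

#include <cstring>

// Hypothetical helper: fetch a float stored in a ggml op's op_params at
// 32-bit slot idx (for ROPE: freq_scale = 6, ext_factor = 7, attn_factor = 8).
static float op_param_f32(const ggml_tensor * op, int idx) {
    float v;
    memcpy(&v, (const int32_t *) op->op_params + idx, sizeof(float));
    return v;
}

The GGML_OP_UPSCALE guard uses a cross-multiplication, src->ne[2] * dst->ne[3] == src->ne[3] * dst->ne[2], to verify that the two outer dimensions are scaled by the same factor without any division, since aclnnUpsampleNearest2dGetWorkspaceSize cannot handle mismatched N/C scaling.
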
@@ -1722,17 +1776,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
        case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_ROPE:
-        case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
-        case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
