 */

 #include "aclnn_ops.h"
+#include "ggml-impl.h"

 #include <aclnnop/aclnn_avgpool2d.h>
 #include <aclnnop/aclnn_cast.h>
@@ -241,10 +242,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

-    int64_t concat_dim = 1;
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+    int32_t acl_dim = 3 - dim;
+
    aclTensor* tensors[] = {acl_src0, acl_src1};
    aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
-    aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
+    aclnn_concat(ctx, tensorList, acl_dst, acl_dim);

    ACL_CHECK(aclDestroyTensorList(tensorList));
    ACL_CHECK(aclDestroyTensor(acl_dst));
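Note on the new `acl_dim` above: ggml orders tensor dimensions innermost-first (`ne[0]` is the contiguous axis), while the 4-D ACL tensors built by `ggml_cann_create_tensor` use the reverse axis order, so a ggml concat dim maps to `3 - dim` on the ACL side. A minimal sketch of that translation, with a hypothetical helper name that is not part of the patch:

    // Illustrative helper (not in the patch): translate a ggml dim index
    // (0 = innermost) into the axis index expected by aclnn_concat on the
    // reversed 4-D ACL layout used by ggml_cann_create_tensor.
    static int32_t ggml_dim_to_acl_dim(const int32_t dim) {
        GGML_ASSERT(dim >= 0 && dim < 4);  // same guard as the patch adds
        return 3 - dim;                    // ggml ne[0] <-> ACL axis 3
    }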
@@ -1437,10 +1442,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];  // kernel
    ggml_tensor* src1 = dst->src[1];  // input

-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
    GGML_TENSOR_BINARY_OP_LOCALS;

    // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
@@ -1462,9 +1463,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    const int64_t OH = is_2D ? ne2 : 1;
    const int64_t OW = ne1;

-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
    // memory allocated increased to 3x when is_2D == false
    const int64_t n_bytes_factor = is_2D ? 1 : 3;
@@ -2859,15 +2857,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
    ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
}

+#ifdef __cplusplus
+extern "C" {
+#endif
+aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+    const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
+    int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
+    aclOpExecutor** executor);
+aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
+                                         uint64_t workspaceSize,
+                                         aclOpExecutor* executor,
+                                         aclrtStream stream);
+#ifdef __cplusplus
+}
+#endif
+
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    // TODO: use ascendc
    // Only test with LLAMA model.
    ggml_tensor* src0 = dst->src[0];  // input
    ggml_tensor* src2 = dst->src[2];  // freq_factors

-    // TODO: with freq_factors
-    GGML_ASSERT(src2 == NULL);
-
    // param
    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
    // const int n_past = ((int32_t *) dst->op_params)[0];
@@ -2885,13 +2895,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

-    GGML_ASSERT(n_dims <= ne0);
+    // TODO: with freq_factors
+    GGML_ASSERT(src2 == NULL);
+    // TODO: attn_factor != 1
+    GGML_ASSERT(attn_factor == 1);
+    // TODO: n_dims <= ne0
+    GGML_ASSERT(n_dims == ne0);
    GGML_ASSERT(n_dims % 2 == 0);
-
    // TODO: ext_factor != 0
    GGML_ASSERT(ext_factor == 0);
    // TODO: freq_scale != 1
    GGML_ASSERT(freq_scale == 1);
+    // TODO: type == GGML_TYPE_F16
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);

    const float theta_scale = powf(freq_base, -2.0f / n_dims);
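For context, `theta_scale` above is the common ratio of the standard RoPE frequency schedule: the angle for position `pos` and pair index `k` is `pos * freq_base^(-2k/n_dims)`. A rough host-side sketch of that schedule, assuming `<cmath>`; it is illustrative only and not how the patch builds the cached sin/cos tables on device:

    // Illustrative only: the angle implied by
    // theta_scale = powf(freq_base, -2.0f / n_dims)
    // for position pos and frequency pair index k.
    static float rope_angle(int64_t pos, int k, int n_dims, float freq_base) {
        const float theta_scale = powf(freq_base, -2.0f / n_dims);
        return (float) pos * powf(theta_scale, (float) k);
    }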
@@ -2924,177 +2940,30 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
                     theta_scale, is_neox);

-    // roll input
-    void* input_roll_buffer;
-    aclTensor* acl_minus_one_tensor;
-    void* minus_one_scale_buffer = nullptr;
-    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
-    ggml_cann_pool_alloc minus_one_scale_allocator(
-        ctx.pool(), sizeof(float_t) * src0->ne[0]);
-    if (!is_neox) {
-        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
-        input_roll_buffer = roll_allocator.get();
-        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
-                                    src0->ne[2], src0->ne[3]};
-        size_t input_roll_nb[GGML_MAX_DIMS];
-        input_roll_nb[0] = ggml_type_size(src0->type);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
-        }
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            src0->data, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-
-        int64_t shifts[] = {1};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-
-        // init [-1, 1, -1, 1, ...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        int64_t dim = 3;
-        int64_t* index = new int64_t[src0->ne[0]];
-        for (int i = 0; i < src0->ne[0]; i++) {
-            index[i] = i / 2 * 2;
-        }
-        int64_t index_num = src0->ne[0];
-        float value = -1;
-        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
-                                index_num, value);
-    } else {
-        // roll input: [q0,q1,q2,...] ->
-        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
-        input_roll_buffer = roll_allocator.get();
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
-
-        int64_t shifts[] = {src0->ne[0] / 2};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;

-        // init [-1, -1, -1, 1, 1,1,...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
+    void* workspaceAddr = nullptr;

-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        // -1 * first half
-        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
-        size_t first_half_nb[GGML_MAX_DIMS];
-        first_half_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
-        }
-        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
-            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
-            first_half_nb, GGML_MAX_DIMS);
-        bool inplace = true;
-        float scale = -1;
-        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
-        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
-    }
-
-    // TODO: n_dims < ne0
-    GGML_ASSERT(n_dims == src0->ne[0]);
-
-    // input * scale
-    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
-                                                  ggml_nbytes(src0));
-    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
-    size_t input_nb[GGML_MAX_DIMS];
-    input_nb[0] = ggml_type_size(src0->type);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
+    int acl_mode = mode;
+    if (mode == 0) {
+        acl_mode = 1;
    }
-    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
-        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
-        input_roll_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);

-    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
-              acl_input_roll_mul_scale_tensor);
-
-    // output
-    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+    aclTensor* acl_x = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-    void* output_fp32_buffer;
-    if (src0->type == GGML_TYPE_F32) {
-        aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor);
-        aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
-                          acl_sin_reshape_tensor);
-        aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
-        // TODO: ne0 != n_dims in mode2
-    } else if (src0->type == GGML_TYPE_F16) {
-        size_t input_fp32_nb[GGML_MAX_DIMS];
-        input_fp32_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
-        }
-        ggml_cann_pool_alloc fp32_allocator1(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer1 = fp32_allocator1.get();
-        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
-            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        ggml_cann_pool_alloc fp32_allocator2(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer2 = fp32_allocator2.get();
-        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
-            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-
-        ggml_cann_pool_alloc fp32_allocator(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        output_fp32_buffer = fp32_allocator.get();
-        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
-            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
-        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
-                  input_fp32_tensor2);
-        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
-                  output_fp32_tensor);
-        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
-
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
-        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
    }

-    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
+                                           executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_x));
    ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
    ACL_CHECK(aclDestroyTensor(acl_dst));
}
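With the manual roll/scale pipeline removed, a small host-side reference of the rotation the fused ACL operator is expected to reproduce can help when comparing CANN output against the CPU backend. This is only a sketch of the standard interleaved (non-neox) RoPE pairing under the constraints asserted above (F32 data, n_dims == ne0, ext_factor == 0, freq_scale == 1); it assumes `<cmath>` and is not part of the patch:

    // Reference rotation for one row of n_dims floats, interleaved pairing:
    // (x0, x1) -> (x0*cos - x1*sin, x0*sin + x1*cos), with the angle for pair k
    // equal to pos * freq_base^(-2k/n_dims). Illustrative only.
    static void rope_ref_row(float* x, int n_dims, int64_t pos, float freq_base) {
        const float theta_scale = powf(freq_base, -2.0f / n_dims);
        float theta = (float) pos;
        for (int i = 0; i < n_dims; i += 2) {
            const float c  = cosf(theta);
            const float s  = sinf(theta);
            const float x0 = x[i];
            const float x1 = x[i + 1];
            x[i]     = x0 * c - x1 * s;
            x[i + 1] = x0 * s + x1 * c;
            theta *= theta_scale;
        }
    }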