#include <aclnnop/aclnn_cast.h>
#include <aclnnop/aclnn_constant_pad_nd.h>
#include <aclnnop/aclnn_copy.h>
+ #include <aclnnop/aclnn_cos.h>
+ #include <aclnnop/aclnn_exp.h>
#include <aclnnop/aclnn_group_norm.h>
#include <aclnnop/aclnn_layer_norm.h>
#include <aclnnop/aclnn_max_pool.h>
#include <aclnnop/aclnn_permute.h>
#include <aclnnop/aclnn_reduce_sum.h>
#include <aclnnop/aclnn_repeat.h>
+ #include <aclnnop/aclnn_sin.h>
#include <aclnnop/aclnn_softmax.h>
#include <aclnnop/aclnn_tril.h>
#include <aclnnop/aclnn_triu.h>
@@ -132,12 +135,9 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyTensor(acl_dst));
}

- void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-     ggml_tensor* src0 = dst->src[0];
-     ggml_tensor* src1 = dst->src[1];
-     aclTensor* acl_src0 = create_acl_tensor(src0);
-     aclTensor* acl_src1 = create_acl_tensor(src1);
-     aclTensor* acl_dst = create_acl_tensor(dst);
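+ // Concatenates acl_src0 and acl_src1 into acl_dst along concat_dim (given in
+ // ACL dim order, i.e. reversed w.r.t. ggml's ne order). acl_src0 and acl_src1
+ // are released here through the tensor list; the caller destroys acl_dst.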
+ void aclnn_concat(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
+                   aclTensor* acl_src1, aclTensor* acl_dst, int64_t concat_dim,
+                   ggml_tensor* bind_tensor) {

    std::vector<aclTensor*> tmp{acl_src0, acl_src1};
    aclTensorList* tensorList = aclCreateTensorList(tmp.data(), tmp.size());
@@ -146,33 +146,44 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

-     // dim1 == ne2, dims in llama.cpp is reversed.
-     ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, 1, acl_dst, &workspaceSize,
-                                        &executor));
+     // dims in llama.cpp are reversed.
+     ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst,
+                                        &workspaceSize, &executor));
    if (workspaceSize > 0) {
-         workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
    }

    aclrtStream main_stream = ctx.stream();
    ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, main_stream));

-     aclDestroyTensorList(tensorList);
-     ACL_CHECK(aclDestroyTensor(acl_dst));
+     ACL_CHECK(aclDestroyTensorList(tensorList));
}

- void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src0 = dst->src[0];
+     ggml_tensor* src1 = dst->src[1];
+     aclTensor* acl_src0 = create_acl_tensor(src0);
+     aclTensor* acl_src1 = create_acl_tensor(src1);
+     aclTensor* acl_dst = create_acl_tensor(dst);

-     float start;
-     float stop;
-     float step;
-     memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
-     memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
-     memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
+     int64_t concat_dim = 1;
+
+     aclnn_concat(ctx, acl_src0, acl_src1, acl_dst, concat_dim, dst);
+
+     // acl_src0 and acl_src1 are released inside aclnn_concat:
+     //   ACL_CHECK(aclDestroyTensor(acl_src0));
+     //   ACL_CHECK(aclDestroyTensor(acl_src1));
+     // is replaced there by
+     //   ACL_CHECK(aclDestroyTensorList(tensorList));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
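+ // Fills acl_dst with the sequence start, start + step, ...; n_elements must
+ // match the number of steps implied by (stop - start) / step.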
+ void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst,
+                   float start, float stop, float step, int64_t n_elements,
+                   ggml_tensor* bind_tensor) {

    int64_t steps = (int64_t)std::ceil((stop - start) / step);
-     GGML_ASSERT(ggml_nelements(dst) == steps);
-     aclTensor* acl_dst = create_acl_tensor(dst);
+     GGML_ASSERT(n_elements == steps);

    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
@@ -185,7 +196,7 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst,
                                          &workspaceSize, &executor));
    if (workspaceSize > 0) {
-         workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
    }

    aclrtStream main_stream = ctx.stream();
@@ -194,6 +205,22 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyScalar(acl_start));
    ACL_CHECK(aclDestroyScalar(acl_end));
    ACL_CHECK(aclDestroyScalar(acl_step));
+ }
+
+ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+     aclTensor* acl_dst = create_acl_tensor(dst);
+
+     int64_t n_elements = ggml_nelements(dst);
+     float start;
+     float stop;
+     float step;
+     memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
+     memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
+     memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
+
+     aclnn_arange(ctx, acl_dst, start, stop, step, n_elements, dst);
    ACL_CHECK(aclDestroyTensor(acl_dst));
}

@@ -582,7 +609,9 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
        0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
    aclnn_pad(ctx, dst, acl_src, acl_dst, paddings);
+
    ACL_CHECK(aclDestroyTensor(acl_dst));
+     ACL_CHECK(aclDestroyTensor(acl_src));
}

void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -1078,3 +1107,232 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyIntArray(paddings));
    ACL_CHECK(aclDestroyIntArray(strides));
}
+
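+ // In-place elementwise exponential: acl_src = exp(acl_src).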
+ void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                ggml_tensor* bind_tensor) {
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnInplaceExpGetWorkspaceSize(acl_src, &workspaceSize,
+                                               &executor));
+     if (workspaceSize > 0) {
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+     }
+
+     ACL_CHECK(aclnnInplaceExp(workspaceAddr, workspaceSize, executor,
+                               ctx.stream()));
+ }
+
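+ // In-place scalar multiplication: acl_src = acl_src * scale.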
+ void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, float scale,
+                 ggml_tensor* bind_tensor) {
+     aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnInplaceMulsGetWorkspaceSize(acl_src, acl_scale,
+                                                &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+     }
+
+     ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor,
+                                ctx.stream()));
+
+     ACL_CHECK(aclDestroyScalar(acl_scale));
+ }
+
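+ // In-place elementwise multiplication: acl_src = acl_src * acl_other.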
+ void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                        aclTensor* acl_other, ggml_tensor* bind_tensor) {
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnInplaceMulGetWorkspaceSize(acl_src, acl_other,
+                                               &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+     }
+
+     ACL_CHECK(aclnnInplaceMul(workspaceAddr, workspaceSize, executor,
+                               ctx.stream()));
+ }
+
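+ // Out-of-place elementwise multiplication: acl_dst = acl_src * acl_other.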
+ void aclnn_noinplace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                          aclTensor* acl_other, aclTensor* acl_dst,
+                          ggml_tensor* bind_tensor) {
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnMulGetWorkspaceSize(acl_src, acl_other, acl_dst,
+                                        &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+     }
+
+     ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor,
+                        ctx.stream()));
+ }
+
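+ // Elementwise cosine: acl_dst = cos(acl_src).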
+ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                aclTensor* acl_dst, ggml_tensor* bind_tensor) {
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize,
+                                        &executor));
+     if (workspaceSize > 0) {
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+     }
+
+     ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor,
+                        ctx.stream()));
+ }
+
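+ // Elementwise sine: acl_dst = sin(acl_src).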
+ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                aclTensor* acl_dst, ggml_tensor* bind_tensor) {
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize,
+                                        &executor));
+     if (workspaceSize > 0) {
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+     }
+
+     ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor,
+                        ctx.stream()));
+ }
+
+ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     const ggml_tensor* src = dst->src[0];
+
+     GGML_ASSERT(src->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+     const int dim = dst->op_params[0];
+     const int max_period = dst->op_params[1];
+     int half = dim / 2;
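+
+     // Sinusoidal timestep embedding:
+     //   freq[j] = exp(-ln(max_period) * j / half),  j = 0 .. half-1
+     //   dst     = concat(cos(t * freq), sin(t * freq))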
+
+     aclTensor* acl_src = create_acl_tensor(src);
+
+     // arange: [0, ..., half)
+     float start = 0;
+     float stop = half;
+     float step = 1;
+     int64_t n_elements_arange = half;
+     int64_t tmp_arange_ne[] = {half};
+     size_t tmp_arange_nb[] = {sizeof(dst->type)};
+
+     void* tmp_arange_buffer = ctx.alloc_buffer(dst, half * sizeof(dst->type));
+     aclTensor* tmp_arange_tensor = create_acl_tensor(tmp_arange_buffer,
+                                                      type_mapping(dst->type),
+                                                      ggml_type_size(dst->type),
+                                                      tmp_arange_ne,
+                                                      tmp_arange_nb,
+                                                      GGML_MAX_DIMS - 3,
+                                                      ACL_FORMAT_ND);
+
+     aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange,
+                  dst);
+
+     // freq
+     float freq_param = -logf(max_period) / half;
+     aclnn_muls(ctx, tmp_arange_tensor, freq_param, dst);
+     aclnn_exp(ctx, tmp_arange_tensor, dst);
+
+     // permute: src [0,1,2,3] -> [0,1,3,2]
+     int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]};
+     size_t tmp_permute_nb[GGML_MAX_DIMS];
+     tmp_permute_nb[0] = ggml_type_size(src->type);
+     for (int i = 1; i < GGML_MAX_DIMS; i++) {
+         tmp_permute_nb[i] = tmp_permute_nb[i-1] * tmp_permute_ne[i-1];
+     }
+
+     void* tmp_permute_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src) * 320);
+     aclTensor* tmp_permute_tensor = create_acl_tensor(tmp_permute_buffer,
+                                                       type_mapping(src->type),
+                                                       ggml_type_size(src->type),
+                                                       tmp_permute_ne,
+                                                       tmp_permute_nb,
+                                                       GGML_MAX_DIMS,
+                                                       ACL_FORMAT_ND);
+     int64_t permute_dim[] = {0, 1, 3, 2};
+     int64_t num_dims = 4;
+     aclnn_permute(ctx, acl_src, tmp_permute_tensor, permute_dim, num_dims, dst);
+
+     // timestep * freq
+     int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
+                             src->ne[3]};
+     size_t tmp_mul_nb[GGML_MAX_DIMS];
+     tmp_mul_nb[0] = ggml_type_size(src->type);
+     for (int i = 1; i < GGML_MAX_DIMS; i++) {
+         tmp_mul_nb[i] = tmp_mul_nb[i-1] * tmp_mul_ne[i-1];
+     }
+
+     int mul_nelements = src->ne[1] * half * src->ne[0] * src->ne[2] *
+                         src->ne[3];
+
+     void* tmp_mul_buffer = ctx.alloc_buffer(dst, mul_nelements *
+                                                  ggml_type_size(src->type));
+     aclTensor* tmp_mul_tensor = create_acl_tensor(tmp_mul_buffer,
+                                                   type_mapping(src->type),
+                                                   ggml_type_size(src->type),
+                                                   tmp_mul_ne,
+                                                   tmp_mul_nb,
+                                                   GGML_MAX_DIMS,
+                                                   ACL_FORMAT_ND);
+     aclnn_noinplace_mul(ctx, tmp_permute_tensor, tmp_arange_tensor,
+                         tmp_mul_tensor, dst);
+
+     // cos
+     void* tmp_cos_buffer = ctx.alloc_buffer(dst, mul_nelements *
+                                                  ggml_type_size(src->type));
+     aclTensor* tmp_cos_tensor = create_acl_tensor(tmp_cos_buffer,
+                                                   type_mapping(dst->type),
+                                                   ggml_type_size(dst->type),
+                                                   tmp_mul_ne,
+                                                   tmp_mul_nb,
+                                                   GGML_MAX_DIMS,
+                                                   ACL_FORMAT_ND);
+
+     aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor, dst);
+
+     // sin
+     void* tmp_sin_buffer = ctx.alloc_buffer(dst, mul_nelements *
+                                                  ggml_type_size(src->type));
+     aclTensor* tmp_sin_tensor = create_acl_tensor(tmp_sin_buffer,
+                                                   type_mapping(dst->type),
+                                                   ggml_type_size(dst->type),
+                                                   tmp_mul_ne,
+                                                   tmp_mul_nb,
+                                                   GGML_MAX_DIMS,
+                                                   ACL_FORMAT_ND);
+
+     aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor, dst);
+
+     // concat(cos, sin) along ne[0]
+     int64_t concat_dim = 3;
+     aclTensor* acl_dst = create_acl_tensor(dst);
+     aclnn_concat(ctx, tmp_cos_tensor, tmp_sin_tensor, acl_dst, concat_dim, dst);
+
+     // release
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_mul_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_cos_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_sin_tensor));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }