#include <aclnnop/aclnn_copy.h>
#include <aclnnop/aclnn_cos.h>
#include <aclnnop/aclnn_exp.h>
+ #include <aclnnop/aclnn_fill_scalar.h>
#include <aclnnop/aclnn_group_norm.h>
#include <aclnnop/aclnn_layer_norm.h>
#include <aclnnop/aclnn_max_pool.h>
#include <aclnnop/aclnn_permute.h>
+ #include <aclnnop/aclnn_pow_tensor_tensor.h>
#include <aclnnop/aclnn_reduce_sum.h>
#include <aclnnop/aclnn_repeat.h>
#include <aclnnop/aclnn_sin.h>
@@ -60,6 +62,30 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyTensor(acl_dst));
}

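+ // Helper: element-wise add via aclnnAdd, acl_dst = acl_src0 + alpha * acl_src1 with alpha = 1.0f.
+ // Operator workspace memory is taken from the context pool and bound to `bind_tensor`.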
+ void aclnn_add(ggml_backend_cann_context& ctx, aclTensor *acl_src0,
+                aclTensor *acl_src1, aclTensor *acl_dst,
+                ggml_tensor* bind_tensor) {
+
+     aclScalar* alpha = nullptr;
+     float alphaValue = 1.0f;
+     alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
+                                        &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+     }
+
+     aclrtStream main_stream = ctx.stream();
+     ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, main_stream));
+
+     ACL_CHECK(aclDestroyScalar(alpha));
+ }
+
void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];
@@ -81,24 +107,8 @@ void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        acl_dst = create_acl_tensor(dst);
    }

-     aclScalar* alpha = nullptr;
-     float alphaValue = 1.0f;
-     alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
-
-     uint64_t workspaceSize = 0;
-     aclOpExecutor* executor;
-     void* workspaceAddr = nullptr;
+     aclnn_add(ctx, acl_src0, acl_src1, acl_dst, dst);

-     ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
-                                        &workspaceSize, &executor));
-     if (workspaceSize > 0) {
-         workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);
-     }
-
-     aclrtStream main_stream = ctx.stream();
-     ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, main_stream));
-
-     ACL_CHECK(aclDestroyScalar(alpha));
    ACL_CHECK(aclDestroyTensor(acl_src0));
    ACL_CHECK(aclDestroyTensor(acl_src1));
    ACL_CHECK(aclDestroyTensor(acl_dst));
@@ -1158,7 +1168,8 @@ void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor *acl_src,
}

void aclnn_noinplcace_mul(ggml_backend_cann_context& ctx, aclTensor *acl_src,
-                         aclTensor *acl_other, aclTensor *acl_dst, ggml_tensor* bind_tensor) {
+                         aclTensor *acl_other, aclTensor *acl_dst,
+                         ggml_tensor* bind_tensor) {

    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
@@ -1208,7 +1219,8 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor *acl_src,
                            ctx.stream()));
}

- void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
+                                   ggml_tensor* dst) {
    const ggml_tensor* src = dst->src[0];

    GGML_ASSERT(src->type == GGML_TYPE_F32);
@@ -1253,7 +1265,7 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* d
        tmp_permute_nb[i] = tmp_permute_nb[i-1] * tmp_permute_ne[i-1];
    }

-     void* tmp_permute_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src) * 320);
+     void* tmp_permute_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src));
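+     // ggml_nbytes(src) is enough for the permuted copy; the previous * 320 factor over-allocated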
    aclTensor* tmp_permute_tenosr = create_acl_tensor(tmp_permute_buffer,
                                                      type_mapping(src->type),
                                                      ggml_type_size(src->type),
@@ -1323,12 +1335,252 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* d
    aclnn_concat(ctx, tensorList, acl_dst, concat_dim, dst);

    // release
+     // do not destroy the list elements separately: destroying both the tensorList and its
+     // elements causes a segmentation fault
    ACL_CHECK(aclDestroyTensorList(tensorList));
    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
    ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr));
    ACL_CHECK(aclDestroyTensor(tmp_mul_tensor));
-     ACL_CHECK(aclDestroyTensor(tmp_cos_tensor));
-     ACL_CHECK(aclDestroyTensor(tmp_sin_tensor));
    ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
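+ // Helper: fill acl_dst in place with a scalar value via aclnnInplaceFillScalar.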
+ void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
+                        aclTensor *acl_dst, ggml_tensor* bind_tensor) {
+
+     auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnInplaceFillScalarGetWorkspaceSize(acl_dst, acl_scalar,
+                                                      &workspaceSize,
+                                                      &executor));
+     if (workspaceSize > 0) {
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+     }
+
+     ACL_CHECK(aclnnInplaceFillScalar(workspaceAddr, workspaceSize, executor,
+                                      ctx.stream()));
+     ACL_CHECK(aclDestroyScalar(acl_scalar));
+ }
+
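+ // Helper: in-place element-wise power via aclnnInplacePowTensorTensor, acl_dst = acl_dst ^ acl_exp.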
+ void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, aclTensor *acl_dst,
+                              aclTensor *acl_exp, ggml_tensor* bind_tensor) {
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnInplacePowTensorTensorGetWorkspaceSize(acl_dst, acl_exp,
+                                                           &workspaceSize,
+                                                           &executor));
+     if (workspaceSize > 0) {
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+     }
+
+     ACL_CHECK(aclnnInplacePowTensorTensor(workspaceAddr, workspaceSize,
+                                           executor, ctx.stream()));
+ }
+
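+ // ggml_cann_alibi: add an ALiBi positional bias to src (KQ_scaled) and write the result to dst.
+ // Per-head slopes are built from m0/m1 raised to an arange exponent, multiplied by the position
+ // index along ne0, and added to src.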
+ void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+
+     const int n_head = ((int32_t *) dst->op_params)[1];
+     float max_bias;
+     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+     const int64_t ne0 = src->ne[0]; // all_seq_len = n_past + ne1
+     const int64_t ne1 = src->ne[1]; // seq_len_without_past
+     const int64_t ne2 = src->ne[2]; // n_head -> this is k
+     const int64_t ne3 = src->ne[3]; // batch
+
+     const int64_t n = ggml_nrows(src);
+     const int64_t ne2_ne3 = n / ne1; // ne2*ne3
+
+     const size_t nb0 = src->nb[0];
+
+     GGML_ASSERT(nb0 == sizeof(float));
+     GGML_ASSERT(n_head == ne2);
+
+     // add alibi to src (KQ_scaled)
+     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
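+     // ALiBi slopes: the first n_heads_log2_floor heads use m0^(1..n), the remaining heads use m1
+     // raised to odd exponents.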
+     // init arange
+     void* tmp_arange_buffer = ctx.alloc_buffer(dst, ne2_ne3 *
+                                                ggml_type_size(dst->type));
+     size_t memset_size = ne2_ne3 * ggml_type_size(dst->type);
+     ACL_CHECK(aclrtMemset(tmp_arange_buffer, memset_size, 0, memset_size));
+
+     // arange1: [1, ..., n_heads_log2_floor+1)
+     float start = 1;
+     float stop = n_heads_log2_floor + 1;
+     float step = 1;
+     int64_t n_elements_arange = n_heads_log2_floor;
+
+     int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
+     size_t tmp_arange1_nb[] = {sizeof(dst->type)};
+     aclTensor* tmp_arange1_tensor = create_acl_tensor(tmp_arange_buffer,
+                                                       type_mapping(dst->type),
+                                                       ggml_type_size(dst->type),
+                                                       tmp_arange1_ne,
+                                                       tmp_arange1_nb,
+                                                       GGML_MAX_DIMS - 3,
+                                                       ACL_FORMAT_ND);
+
+     aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange,
+                  dst);
+
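+     // Heads beyond the largest power of two get odd exponents (1, 3, 5, ...) via a second arange.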
+     aclTensor* tmp_arange2_tensor = nullptr;
+     if (n_heads_log2_floor < ne2_ne3) {
+         // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
+         start = 1;
+         stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
+         step = 2;
+         n_elements_arange = ne2_ne3 - n_heads_log2_floor;
+         int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
+         size_t tmp_arange2_nb[] = {sizeof(dst->type)};
+
+         // assign to the outer tensor (do not redeclare it) so it can be released below
+         tmp_arange2_tensor = create_acl_tensor(tmp_arange_buffer +
+                                                n_heads_log2_floor *
+                                                ggml_type_size(dst->type),
+                                                type_mapping(dst->type),
+                                                ggml_type_size(dst->type),
+                                                tmp_arange2_ne,
+                                                tmp_arange2_nb,
+                                                GGML_MAX_DIMS - 3,
+                                                ACL_FORMAT_ND);
+         aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
+                      n_elements_arange, dst);
+     }
+
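+     // mk_base holds the per-head slope bases: m0 for the first group of heads, m1 for the rest.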
+     // init mk_base
+     void* tmp_mk_base_buffer = ctx.alloc_buffer(dst, ne2_ne3 *
+                                                 ggml_type_size(dst->type));
+     int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
+     size_t tmp_mk_base1_nb[] = {sizeof(dst->type)};
+     aclTensor* tmp_mk_base1_tensor = create_acl_tensor(tmp_mk_base_buffer,
+                                                        type_mapping(dst->type),
+                                                        ggml_type_size(dst->type),
+                                                        tmp_mk_base1_ne,
+                                                        tmp_mk_base1_nb,
+                                                        GGML_MAX_DIMS - 3,
+                                                        ACL_FORMAT_ND);
+
+     aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor, dst);
+
+     aclTensor* tmp_mk_base2_tensor = nullptr;
+     if (n_heads_log2_floor < ne2_ne3) {
+         int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
+         size_t tmp_mk_base2_nb[] = {sizeof(dst->type)};
+         // assign to the outer tensor (do not redeclare it) so it can be released below
+         tmp_mk_base2_tensor = create_acl_tensor(tmp_mk_base_buffer +
+                                                 n_heads_log2_floor *
+                                                 ggml_type_size(dst->type),
+                                                 type_mapping(dst->type),
+                                                 ggml_type_size(dst->type),
+                                                 tmp_mk_base2_ne,
+                                                 tmp_mk_base2_nb,
+                                                 GGML_MAX_DIMS - 3,
+                                                 ACL_FORMAT_ND);
+         aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor, dst);
+     }
+
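+     // mk[i] = mk_base[i] ^ arange[i] gives the final ALiBi slope for head i.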
+     // init mk
+     int64_t tmp_mk_base_ne[] = {ne2_ne3};
+     size_t tmp_mk_base_nb[] = {sizeof(dst->type)};
+     aclTensor* tmp_mk_base_tensor = create_acl_tensor(tmp_mk_base_buffer,
+                                                       type_mapping(dst->type),
+                                                       ggml_type_size(dst->type),
+                                                       tmp_mk_base_ne,
+                                                       tmp_mk_base_nb,
+                                                       GGML_MAX_DIMS - 3,
+                                                       ACL_FORMAT_ND);
+     aclTensor* tmp_arange_tensor = create_acl_tensor(tmp_arange_buffer,
+                                                      type_mapping(dst->type),
+                                                      ggml_type_size(dst->type),
+                                                      tmp_mk_base_ne,
+                                                      tmp_mk_base_nb,
+                                                      GGML_MAX_DIMS - 3,
+                                                      ACL_FORMAT_ND);
+     aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor, dst);
+
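+     // Reshape mk to (1, 1, ne2, ne3) so it broadcasts against the per-position values below.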
+     // reshape mk
+     int64_t tmp_mk_ne[] = {1, 1, ne2, ne3};
+     size_t tmp_mk_nb[GGML_MAX_DIMS];
+     tmp_mk_nb[0] = ggml_type_size(dst->type);
+     for (int i = 1; i < GGML_MAX_DIMS; i++) {
+         tmp_mk_nb[i] = tmp_mk_nb[i-1] * tmp_mk_ne[i-1];
+     }
+     aclTensor* tmp_mk_tensor = create_acl_tensor(tmp_mk_base_buffer,
+                                                  type_mapping(dst->type),
+                                                  ggml_type_size(dst->type),
+                                                  tmp_mk_ne,
+                                                  tmp_mk_nb,
+                                                  GGML_MAX_DIMS,
+                                                  ACL_FORMAT_ND);
+
+     // arange: [0, ..., ne0)
+     start = 0;
+     stop = ne0;
+     step = 1;
+     n_elements_arange = ne0;
+     int64_t tmp_arange3_ne[] = {ne0, 1, 1, 1};
+     size_t tmp_arange3_nb[] = {sizeof(dst->type)};
+
+     void* tmp_arange3_buffer = ctx.alloc_buffer(dst, ne0 * sizeof(dst->type));
+     aclTensor* tmp_arange3_tensor = create_acl_tensor(tmp_arange3_buffer,
+                                                       type_mapping(dst->type),
+                                                       ggml_type_size(dst->type),
+                                                       tmp_arange3_ne,
+                                                       tmp_arange3_nb,
+                                                       GGML_MAX_DIMS,
+                                                       ACL_FORMAT_ND);
+
+     aclnn_arange(ctx, tmp_arange3_tensor, start, stop, step, n_elements_arange,
+                  dst);
+
+     // arange3 * mk
+     int64_t tmp_output_ne[] = {ne0, 1, ne2, ne3};
+     size_t tmp_output_nb[GGML_MAX_DIMS];
+     tmp_output_nb[0] = ggml_type_size(dst->type);
+     for (int i = 1; i < GGML_MAX_DIMS; i++) {
+         tmp_output_nb[i] = tmp_output_nb[i-1] * tmp_output_ne[i-1];
+     }
+     void* tmp_output_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst));
+     aclTensor* tmp_output_tensor = create_acl_tensor(tmp_output_buffer,
+                                                      type_mapping(dst->type),
+                                                      ggml_type_size(dst->type),
+                                                      tmp_output_ne,
+                                                      tmp_output_nb,
+                                                      GGML_MAX_DIMS,
+                                                      ACL_FORMAT_ND);
+     aclnn_noinplcace_mul(ctx, tmp_arange3_tensor, tmp_mk_tensor,
+                          tmp_output_tensor, dst);
+
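+     // the bias is complete; add it to src (KQ_scaled) and write the result to dst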
+     // add
+     aclTensor* acl_src = create_acl_tensor(src);
+     aclTensor* acl_dst = create_acl_tensor(dst);
+     aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst, dst);
+
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+     ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_mk_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_arange3_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_output_tensor));
}