Commit a21434a

add alibi & fix double release in timestep_embedding

1 parent 5009d83 commit a21434a

4 files changed: +312 -23 lines


ggml-cann.cpp

Lines changed: 4 additions & 1 deletion
@@ -450,6 +450,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             break;
         case GGML_OP_ROPE:
         case GGML_OP_ALIBI:
+            ggml_cann_alibi(ctx, dst);
+            break;
         case GGML_OP_IM2COL:
             ggml_cann_im2col(ctx, dst);
             break;
@@ -685,8 +687,9 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
             return true;
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
            return false;
+        case GGML_OP_ALIBI:
+            return true;
         case GGML_OP_IM2COL:
            return true;
        case GGML_OP_POOL_2D:
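
With GGML_OP_ALIBI moved out of the "return false" group, the scheduler can
keep ALiBi nodes on the CANN device instead of falling back to the CPU. A
minimal usage sketch of how a caller probes this through the standard ggml
backend API (the helper name is ours; the node is assumed to already carry
op == GGML_OP_ALIBI):

    #include "ggml-backend.h"

    // After this commit a CANN backend returns true here; before, it
    // returned false and the ALiBi op had to run on the CPU backend.
    static bool cann_handles_alibi(ggml_backend_t backend,
                                   const struct ggml_tensor * alibi_node) {
        return ggml_backend_supports_op(backend, alibi_node);
    }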

ggml-cann/aclnn_ops.cpp

Lines changed: 274 additions & 22 deletions
@@ -6,10 +6,12 @@
 #include <aclnnop/aclnn_copy.h>
 #include <aclnnop/aclnn_cos.h>
 #include <aclnnop/aclnn_exp.h>
+#include <aclnnop/aclnn_fill_scalar.h>
 #include <aclnnop/aclnn_group_norm.h>
 #include <aclnnop/aclnn_layer_norm.h>
 #include <aclnnop/aclnn_max_pool.h>
 #include <aclnnop/aclnn_permute.h>
+#include <aclnnop/aclnn_pow_tensor_tensor.h>
 #include <aclnnop/aclnn_reduce_sum.h>
 #include <aclnnop/aclnn_repeat.h>
 #include <aclnnop/aclnn_sin.h>
@@ -60,6 +62,30 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
 
+// computes acl_dst = acl_src0 + alpha * acl_src1, with alpha fixed at 1.0f
+void aclnn_add(ggml_backend_cann_context& ctx, aclTensor *acl_src0,
+               aclTensor *acl_src1, aclTensor *acl_dst,
+               ggml_tensor* bind_tensor) {
+
+    aclScalar* alpha = nullptr;
+    float alphaValue = 1.0f;
+    alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
+                                       &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+    }
+
+    aclrtStream main_stream = ctx.stream();
+    ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, main_stream));
+
+    ACL_CHECK(aclDestroyScalar(alpha));
+}
+
 void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];
     ggml_tensor* src1 = dst->src[1];
@@ -81,24 +107,8 @@ void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         acl_dst = create_acl_tensor(dst);
     }
 
-    aclScalar* alpha = nullptr;
-    float alphaValue = 1.0f;
-    alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
-
-    uint64_t workspaceSize = 0;
-    aclOpExecutor* executor;
-    void* workspaceAddr = nullptr;
+    aclnn_add(ctx, acl_src0, acl_src1, acl_dst, dst);
 
-    ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
-                                       &workspaceSize, &executor));
-    if (workspaceSize > 0) {
-        workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);
-    }
-
-    aclrtStream main_stream = ctx.stream();
-    ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, main_stream));
-
-    ACL_CHECK(aclDestroyScalar(alpha));
     ACL_CHECK(aclDestroyTensor(acl_src0));
     ACL_CHECK(aclDestroyTensor(acl_src1));
     ACL_CHECK(aclDestroyTensor(acl_dst));
@@ -1158,7 +1168,8 @@ void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor *acl_src,
 }
 
 void aclnn_noinplcace_mul(ggml_backend_cann_context& ctx, aclTensor *acl_src,
-                          aclTensor *acl_other, aclTensor *acl_dst, ggml_tensor* bind_tensor) {
+                          aclTensor *acl_other, aclTensor *acl_dst,
+                          ggml_tensor* bind_tensor) {
 
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -1208,7 +1219,8 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor *acl_src,
                    ctx.stream()));
 }
 
-void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
+                                  ggml_tensor* dst) {
     const ggml_tensor* src = dst->src[0];
 
     GGML_ASSERT(src->type == GGML_TYPE_F32);
@@ -1253,7 +1265,7 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* d
         tmp_permute_nb[i] = tmp_permute_nb[i-1] * tmp_permute_ne[i-1];
     }
 
-    void* tmp_permute_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src)*320);
+    void* tmp_permute_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src));
     aclTensor* tmp_permute_tenosr = create_acl_tensor(tmp_permute_buffer,
                                                       type_mapping(src->type),
                                                       ggml_type_size(src->type),
@@ -1323,12 +1335,252 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* d
     aclnn_concat(ctx, tensorList, acl_dst, concat_dim, dst);
 
     // release
+    // deleting both the tensorList and its elements causes a segmentation
+    // fault; the list owns the cos/sin tensors, so only the list is destroyed
     ACL_CHECK(aclDestroyTensorList(tensorList));
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
     ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr));
     ACL_CHECK(aclDestroyTensor(tmp_mul_tensor));
-    ACL_CHECK(aclDestroyTensor(tmp_cos_tensor));
-    ACL_CHECK(aclDestroyTensor(tmp_sin_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
+                       aclTensor *acl_dst, ggml_tensor* bind_tensor) {
+
+    auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnInplaceFillScalarGetWorkspaceSize(acl_dst, acl_scalar,
+                                                     &workspaceSize,
+                                                     &executor));
+    if (workspaceSize > 0) {
+        workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+    }
+
+    ACL_CHECK(aclnnInplaceFillScalar(workspaceAddr, workspaceSize, executor,
+                                     ctx.stream()));
+    ACL_CHECK(aclDestroyScalar(acl_scalar));
+}
+
+void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
+                             aclTensor *acl_dst, aclTensor *acl_exp,
+                             ggml_tensor* bind_tensor) {
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnInplacePowTensorTensorGetWorkspaceSize(acl_dst, acl_exp,
+                                                          &workspaceSize,
+                                                          &executor));
+    if (workspaceSize > 0) {
+        workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+    }
+
+    ACL_CHECK(aclnnInplacePowTensorTensor(workspaceAddr, workspaceSize,
+                                          executor, ctx.stream()));
+}
+
+void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    const int n_head = ((int32_t*) dst->op_params)[1];
+    float max_bias;
+    memcpy(&max_bias, (int32_t*) dst->op_params + 2, sizeof(float));
+
+    const int64_t ne0 = src->ne[0]; // all_seq_len = n_past + ne1
+    const int64_t ne1 = src->ne[1]; // seq_len_without_past
+    const int64_t ne2 = src->ne[2]; // n_head -> this is k
+    const int64_t ne3 = src->ne[3]; // batch
+
+    const int64_t n = ggml_nrows(src);
+    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
+
+    const size_t nb0 = src->nb[0];
+
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(n_head == ne2);
+
+    // add alibi to src (KQ_scaled)
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+    // slope bases: the first n_heads_log2_floor heads use powers of m0,
+    // the remaining heads use odd powers of m1
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+    // init arange: one exponent per (head, batch) row
+    void* tmp_arange_buffer = ctx.alloc_buffer(dst, ne2_ne3 *
+                                               ggml_type_size(dst->type));
+    size_t memset_size = ne2_ne3 * ggml_type_size(dst->type);
+    ACL_CHECK(aclrtMemset(tmp_arange_buffer, memset_size, 0, memset_size));
+
+    // arange1: [1, ..., n_heads_log2_floor+1)
+    float start = 1;
+    float stop = n_heads_log2_floor + 1;
+    float step = 1;
+    int64_t n_elements_arange = n_heads_log2_floor;
+
+    int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
+    size_t tmp_arange1_nb[] = {ggml_type_size(dst->type)};
+    aclTensor* tmp_arange1_tensor = create_acl_tensor(
+        tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type),
+        tmp_arange1_ne, tmp_arange1_nb, GGML_MAX_DIMS-3, ACL_FORMAT_ND);
+
+    aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange,
+                 dst);
+
+    aclTensor* tmp_arange2_tensor = nullptr;
+    if (n_heads_log2_floor < ne2_ne3) {
+        // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
+        start = 1;
+        stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
+        step = 2;
+        n_elements_arange = ne2_ne3 - n_heads_log2_floor;
+        int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
+        size_t tmp_arange2_nb[] = {ggml_type_size(dst->type)};
+
+        // assigned (not re-declared) so it can be released at the end
+        tmp_arange2_tensor = create_acl_tensor(
+            (char*)tmp_arange_buffer +
+                n_heads_log2_floor * ggml_type_size(dst->type),
+            type_mapping(dst->type), ggml_type_size(dst->type),
+            tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS-3, ACL_FORMAT_ND);
+        aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
+                     n_elements_arange, dst);
+    }
+
+    // init mk_base
+    void* tmp_mk_base_buffer = ctx.alloc_buffer(dst, ne2_ne3 *
+                                                ggml_type_size(dst->type));
+    int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
+    size_t tmp_mk_base1_nb[] = {ggml_type_size(dst->type)};
+    aclTensor* tmp_mk_base1_tensor = create_acl_tensor(
+        tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type),
+        tmp_mk_base1_ne, tmp_mk_base1_nb, GGML_MAX_DIMS-3, ACL_FORMAT_ND);
+
+    aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor, dst);
+
+    aclTensor* tmp_mk_base2_tensor = nullptr;
+    if (n_heads_log2_floor < ne2_ne3) {
+        int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
+        size_t tmp_mk_base2_nb[] = {ggml_type_size(dst->type)};
+        // assigned (not re-declared) so it can be released at the end
+        tmp_mk_base2_tensor = create_acl_tensor(
+            (char*)tmp_mk_base_buffer +
+                n_heads_log2_floor * ggml_type_size(dst->type),
+            type_mapping(dst->type), ggml_type_size(dst->type),
+            tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS-3, ACL_FORMAT_ND);
+        aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor, dst);
+    }
+
+    // init mk: mk_base ^ arange, element-wise, in place
+    int64_t tmp_mk_base_ne[] = {ne2_ne3};
+    size_t tmp_mk_base_nb[] = {ggml_type_size(dst->type)};
+    aclTensor* tmp_mk_base_tensor = create_acl_tensor(
+        tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type),
+        tmp_mk_base_ne, tmp_mk_base_nb, GGML_MAX_DIMS-3, ACL_FORMAT_ND);
+    aclTensor* tmp_arange_tensor = create_acl_tensor(
+        tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type),
+        tmp_mk_base_ne, tmp_mk_base_nb, GGML_MAX_DIMS-3, ACL_FORMAT_ND);
+    aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor, dst);
+
+    // reshape mk to [1, 1, ne2, ne3] so it broadcasts over rows
+    int64_t tmp_mk_ne[] = {1, 1, ne2, ne3};
+    size_t tmp_mk_nb[GGML_MAX_DIMS];
+    tmp_mk_nb[0] = ggml_type_size(dst->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        tmp_mk_nb[i] = tmp_mk_nb[i-1] * tmp_mk_ne[i-1];
+    }
+    aclTensor* tmp_mk_tensor = create_acl_tensor(
+        tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type),
+        tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+
+    // arange: [0, ..., ne0)
+    start = 0;
+    stop = ne0;
+    step = 1;
+    n_elements_arange = ne0;
+    int64_t tmp_arange3_ne[] = {ne0, 1, 1, 1};
+    size_t tmp_arange3_nb[GGML_MAX_DIMS];
+    tmp_arange3_nb[0] = ggml_type_size(dst->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        tmp_arange3_nb[i] = tmp_arange3_nb[i-1] * tmp_arange3_ne[i-1];
+    }
+
+    void* tmp_arange3_buffer = ctx.alloc_buffer(dst, ne0 *
+                                                ggml_type_size(dst->type));
+    aclTensor* tmp_arange3_tensor = create_acl_tensor(
+        tmp_arange3_buffer, type_mapping(dst->type), ggml_type_size(dst->type),
+        tmp_arange3_ne, tmp_arange3_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+
+    aclnn_arange(ctx, tmp_arange3_tensor, start, stop, step, n_elements_arange,
+                 dst);
+
+    // arange3 * mk: bias = position index * per-head slope
+    int64_t tmp_output_ne[] = {ne0, 1, ne2, ne3};
+    size_t tmp_output_nb[GGML_MAX_DIMS];
+    tmp_output_nb[0] = ggml_type_size(dst->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        tmp_output_nb[i] = tmp_output_nb[i-1] * tmp_output_ne[i-1];
+    }
+    void* tmp_output_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst));
+    aclTensor* tmp_output_tensor = create_acl_tensor(
+        tmp_output_buffer, type_mapping(dst->type), ggml_type_size(dst->type),
+        tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+    aclnn_noinplcace_mul(ctx, tmp_arange3_tensor, tmp_mk_tensor,
+                         tmp_output_tensor, dst);
+
+    // add: dst = src (KQ_scaled) + bias
+    aclTensor* acl_src = create_acl_tensor(src);
+    aclTensor* acl_dst = create_acl_tensor(dst);
+    aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst, dst);
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_mk_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_arange3_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_output_tensor));
 }
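
For reference, the bias this kernel materializes is
dst[i0, i1, h, b] = src[i0, i1, h, b] + i0 * m(h): position index times a
per-head slope, with the slopes taken from the two geometric sequences that
aclnn_fill_scalar and aclnn_pow_tensor_tensor build above. A host-side sketch
of the same slope schedule (a minimal reference, not part of the commit; the
loop form is ours, while m0, m1, and the two exponent ranges mirror
arange1/arange2):

    #include <cmath>
    #include <vector>

    static std::vector<float> alibi_slopes(int n_head, float max_bias) {
        const int floor_pow2 = 1 << (int) std::floor(std::log2((float) n_head));
        const float m0 = std::pow(2.0f, -max_bias / floor_pow2);
        const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / floor_pow2);

        std::vector<float> m(n_head);
        for (int h = 0; h < n_head; h++) {
            // arange1: exponents 1..floor_pow2 on base m0
            // arange2: odd exponents 1, 3, 5, ... on base m1
            m[h] = h < floor_pow2
                 ? std::pow(m0, (float) (h + 1))
                 : std::pow(m1, (float) (2 * (h - floor_pow2) + 1));
        }
        return m;
    }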

ggml-cann/aclnn_ops.h

Lines changed: 2 additions & 0 deletions
@@ -64,6 +64,8 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
 void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
+void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
                                        aclTensor*, uint64_t*, aclOpExecutor**),
           aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
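
The "fix double release" half of the commit message comes down to an ownership
rule: once tensors are packed into an aclTensorList, destroying the list also
releases its elements, so destroying an element again afterwards is a double
free. A sketch of the rule as we understand ACL's semantics (tensor creation
elided; treat the exact aclCreateTensorList signature as an assumption):

    aclTensor* parts[] = {tmp_cos_tensor, tmp_sin_tensor};  // created earlier
    aclTensorList* tensorList = aclCreateTensorList(parts, 2);

    // ... use tensorList, e.g. as the input of aclnn_concat ...

    ACL_CHECK(aclDestroyTensorList(tensorList)); // also frees cos/sin tensors
    // ACL_CHECK(aclDestroyTensor(tmp_cos_tensor)); // would double free
    // ACL_CHECK(aclDestroyTensor(tmp_sin_tensor)); // would double free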
