Skip to content

Commit a4ae386

Browse files
wangshuai09 and hipudding
authored and committed
add timestep_embedding
1 parent 136775e commit a4ae386

File tree

3 files changed

+285
-26
lines changed

3 files changed

+285
-26
lines changed

ggml-cann.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
410410
ggml_cann_arange(ctx, dst);
411411
break;
412412
case GGML_OP_TIMESTEP_EMBEDDING:
413-
return false;
413+
ggml_cann_timestep_embedding(ctx, dst);
414+
break;
414415
case GGML_OP_LEAKY_RELU:
415416
ggml_cann_leaky_relu(ctx, dst);
416417
break;
@@ -696,9 +697,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
696697
case GGML_OP_UPSCALE:
697698
case GGML_OP_PAD:
698699
case GGML_OP_ARANGE:
699-
return true;
700700
case GGML_OP_TIMESTEP_EMBEDDING:
701-
return false;
702701
case GGML_OP_LEAKY_RELU:
703702
return true;
704703
default:

ggml-cann/aclnn_ops.cpp

Lines changed: 281 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,15 @@
44
#include <aclnnop/aclnn_cast.h>
55
#include <aclnnop/aclnn_constant_pad_nd.h>
66
#include <aclnnop/aclnn_copy.h>
7+
#include <aclnnop/aclnn_cos.h>
8+
#include <aclnnop/aclnn_exp.h>
79
#include <aclnnop/aclnn_group_norm.h>
810
#include <aclnnop/aclnn_layer_norm.h>
911
#include <aclnnop/aclnn_max_pool.h>
1012
#include <aclnnop/aclnn_permute.h>
1113
#include <aclnnop/aclnn_reduce_sum.h>
1214
#include <aclnnop/aclnn_repeat.h>
15+
#include <aclnnop/aclnn_sin.h>
1316
#include <aclnnop/aclnn_softmax.h>
1417
#include <aclnnop/aclnn_tril.h>
1518
#include <aclnnop/aclnn_triu.h>
@@ -132,12 +135,9 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
132135
ACL_CHECK(aclDestroyTensor(acl_dst));
133136
}
134137

135-
void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
136-
ggml_tensor* src0 = dst->src[0];
137-
ggml_tensor* src1 = dst->src[1];
138-
aclTensor* acl_src0 = create_acl_tensor(src0);
139-
aclTensor* acl_src1 = create_acl_tensor(src1);
140-
aclTensor* acl_dst = create_acl_tensor(dst);
138+
void aclnn_concat(ggml_backend_cann_context& ctx, aclTensor *acl_src0,
139+
aclTensor *acl_src1, aclTensor *acl_dst, int64_t concat_dim,
140+
ggml_tensor* bind_tensor) {
141141

142142
std::vector<aclTensor*> tmp{acl_src0, acl_src1};
143143
aclTensorList* tensorList = aclCreateTensorList(tmp.data(), tmp.size());
@@ -146,33 +146,44 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
146146
aclOpExecutor* executor;
147147
void* workspaceAddr = nullptr;
148148

149-
// dim1 == ne2, dims in llama.cpp is reversed.
150-
ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, 1, acl_dst, &workspaceSize,
151-
&executor));
149+
// dims in llama.cpp is reversed.
150+
ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst,
151+
&workspaceSize, &executor));
152152
if (workspaceSize > 0) {
153-
workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);
153+
workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
154154
}
155155

156156
aclrtStream main_stream = ctx.stream();
157157
ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, main_stream));
158158

159-
aclDestroyTensorList(tensorList);
160-
ACL_CHECK(aclDestroyTensor(acl_dst));
159+
ACL_CHECK(aclDestroyTensorList(tensorList));
161160
}
162161

163-
void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
164-
GGML_ASSERT(dst->type == GGML_TYPE_F32);
162+
void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
163+
ggml_tensor* src0 = dst->src[0];
164+
ggml_tensor* src1 = dst->src[1];
165+
aclTensor* acl_src0 = create_acl_tensor(src0);
166+
aclTensor* acl_src1 = create_acl_tensor(src1);
167+
aclTensor* acl_dst = create_acl_tensor(dst);
165168

166-
float start;
167-
float stop;
168-
float step;
169-
memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
170-
memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
171-
memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
169+
int64_t concat_dim = 1;
170+
171+
aclnn_concat(ctx, acl_src0, acl_src1, acl_dst, concat_dim, dst);
172+
173+
// release acl_src0, acl_src1 in aclnn_concat
174+
// ACL_CHECK(aclDestroyTensor(acl_src0));
175+
// ACL_CHECK(aclDestroyTensor(acl_src1));
176+
// ->
177+
// ACL_CHECK(aclDestroyTensorList(tensorList));
178+
ACL_CHECK(aclDestroyTensor(acl_dst));
179+
}
180+
181+
void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor *acl_dst,
182+
float start, float stop, float step, int64_t n_elements,
183+
ggml_tensor* bind_tensor) {
172184

173185
int64_t steps = (int64_t)std::ceil((stop - start) / step);
174-
GGML_ASSERT(ggml_nelements(dst) == steps);
175-
aclTensor* acl_dst = create_acl_tensor(dst);
186+
GGML_ASSERT(n_elements == steps);
176187

177188
uint64_t workspaceSize = 0;
178189
aclOpExecutor* executor;
@@ -185,7 +196,7 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
185196
ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst,
186197
&workspaceSize, &executor));
187198
if (workspaceSize > 0) {
188-
workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);
199+
workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
189200
}
190201

191202
aclrtStream main_stream = ctx.stream();
@@ -194,6 +205,22 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
194205
ACL_CHECK(aclDestroyScalar(acl_start));
195206
ACL_CHECK(aclDestroyScalar(acl_end));
196207
ACL_CHECK(aclDestroyScalar(acl_step));
208+
}
209+
210+
void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
211+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
212+
213+
aclTensor* acl_dst = create_acl_tensor(dst);
214+
215+
int64_t n_elements = ggml_nelements(dst);
216+
float start;
217+
float stop;
218+
float step;
219+
memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
220+
memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
221+
memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
222+
223+
aclnn_arange(ctx, acl_dst, start, stop, step, n_elements, dst);
197224
ACL_CHECK(aclDestroyTensor(acl_dst));
198225
}
199226

@@ -582,7 +609,9 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
582609
0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
583610
0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
584611
aclnn_pad(ctx, dst, acl_src, acl_dst, paddings);
612+
585613
ACL_CHECK(aclDestroyTensor(acl_dst));
614+
ACL_CHECK(aclDestroyTensor(acl_src));
586615
}
587616

588617
void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -1078,3 +1107,232 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
10781107
ACL_CHECK(aclDestroyIntArray(paddings));
10791108
ACL_CHECK(aclDestroyIntArray(strides));
10801109
}
1110+
1111+
void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor *acl_src,
1112+
ggml_tensor* bind_tensor) {
1113+
1114+
uint64_t workspaceSize = 0;
1115+
aclOpExecutor* executor;
1116+
void* workspaceAddr = nullptr;
1117+
1118+
ACL_CHECK(aclnnInplaceExpGetWorkspaceSize(acl_src, &workspaceSize,
1119+
&executor));
1120+
if(workspaceSize > 0) {
1121+
workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
1122+
}
1123+
1124+
ACL_CHECK(aclnnInplaceExp(workspaceAddr, workspaceSize, executor,
1125+
ctx.stream()));
1126+
1127+
}
1128+
1129+
void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor *acl_src, float scale,
1130+
ggml_tensor* bind_tensor) {
1131+
1132+
aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
1133+
1134+
uint64_t workspaceSize = 0;
1135+
aclOpExecutor* executor;
1136+
void* workspaceAddr = nullptr;
1137+
1138+
ACL_CHECK(aclnnInplaceMulsGetWorkspaceSize(acl_src, acl_scale,
1139+
&workspaceSize, &executor));
1140+
if (workspaceSize > 0) {
1141+
workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
1142+
}
1143+
1144+
ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor,
1145+
ctx.stream()));
1146+
1147+
ACL_CHECK(aclDestroyScalar(acl_scale));
1148+
}
1149+
1150+
void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor *acl_src,
1151+
aclTensor *acl_other, ggml_tensor* bind_tensor) {
1152+
1153+
uint64_t workspaceSize = 0;
1154+
aclOpExecutor* executor;
1155+
void* workspaceAddr = nullptr;
1156+
1157+
ACL_CHECK(aclnnInplaceMulGetWorkspaceSize(acl_src, acl_other,
1158+
&workspaceSize, &executor));
1159+
if (workspaceSize > 0) {
1160+
workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
1161+
}
1162+
1163+
ACL_CHECK(aclnnInplaceMul(workspaceAddr, workspaceSize, executor,
1164+
ctx.stream()));
1165+
}
1166+
1167+
void aclnn_noinplcace_mul(ggml_backend_cann_context& ctx, aclTensor *acl_src,
1168+
aclTensor *acl_other, aclTensor *acl_dst, ggml_tensor* bind_tensor) {
1169+
1170+
uint64_t workspaceSize = 0;
1171+
aclOpExecutor* executor;
1172+
void* workspaceAddr = nullptr;
1173+
1174+
ACL_CHECK(aclnnMulGetWorkspaceSize(acl_src, acl_other, acl_dst,
1175+
&workspaceSize, &executor));
1176+
if (workspaceSize > 0) {
1177+
workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
1178+
}
1179+
1180+
ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor,
1181+
ctx.stream()));
1182+
}
1183+
1184+
void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor *acl_src,
1185+
aclTensor *acl_dst, ggml_tensor* bind_tensor) {
1186+
1187+
uint64_t workspaceSize = 0;
1188+
aclOpExecutor* executor;
1189+
void* workspaceAddr = nullptr;
1190+
1191+
ACL_CHECK(aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize,
1192+
&executor));
1193+
if (workspaceSize > 0) {
1194+
workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
1195+
}
1196+
1197+
ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor,
1198+
ctx.stream()));
1199+
}
1200+
1201+
void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor *acl_src,
1202+
aclTensor *acl_dst, ggml_tensor* bind_tensor) {
1203+
1204+
uint64_t workspaceSize = 0;
1205+
aclOpExecutor* executor;
1206+
void* workspaceAddr = nullptr;
1207+
1208+
ACL_CHECK(aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize,
1209+
&executor));
1210+
if (workspaceSize > 0) {
1211+
workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
1212+
}
1213+
1214+
ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor,
1215+
ctx.stream()));
1216+
}
1217+
1218+
void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1219+
const ggml_tensor* src = dst->src[0];
1220+
1221+
GGML_ASSERT(src->type == GGML_TYPE_F32);
1222+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
1223+
1224+
const int dim = dst->op_params[0];
1225+
const int max_period = dst->op_params[1];
1226+
int half = dim / 2;
1227+
1228+
aclTensor* acl_src = create_acl_tensor(src);
1229+
1230+
// arange: [0, ..., half)
1231+
float start = 0;
1232+
float stop = half;
1233+
float step = 1;
1234+
int64_t n_elements_arange = half;
1235+
int64_t tmp_arange_ne[] = {half};
1236+
size_t tmp_arange_nb[] = {sizeof(dst->type)};
1237+
1238+
void* tmp_arange_buffer = ctx.alloc_buffer(dst, half * sizeof(dst->type));
1239+
aclTensor* tmp_arange_tensor = create_acl_tensor(tmp_arange_buffer,
1240+
type_mapping(dst->type),
1241+
ggml_type_size(dst->type),
1242+
tmp_arange_ne,
1243+
tmp_arange_nb,
1244+
GGML_MAX_DIMS-3,
1245+
ACL_FORMAT_ND);
1246+
1247+
aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange,
1248+
dst);
1249+
1250+
// freq
1251+
float freq_param = -logf(max_period) / half;
1252+
aclnn_muls(ctx, tmp_arange_tensor, freq_param, dst);
1253+
aclnn_exp(ctx, tmp_arange_tensor, dst);
1254+
1255+
// permute: src [0,1,2,3]->[0,1,3,2]
1256+
int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]};
1257+
size_t tmp_permute_nb[GGML_MAX_DIMS];
1258+
tmp_permute_nb[0] = ggml_type_size(src->type);
1259+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
1260+
tmp_permute_nb[i] = tmp_permute_nb[i-1] * tmp_permute_ne[i-1];
1261+
}
1262+
1263+
void* tmp_permute_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src)*320);
1264+
aclTensor* tmp_permute_tenosr = create_acl_tensor(tmp_permute_buffer,
1265+
type_mapping(src->type),
1266+
ggml_type_size(src->type),
1267+
tmp_permute_ne,
1268+
tmp_permute_nb,
1269+
GGML_MAX_DIMS,
1270+
ACL_FORMAT_ND);
1271+
int64_t permute_dim[] = {0, 1, 3, 2};
1272+
int64_t num_dims = 4;
1273+
aclnn_permute(ctx, acl_src, tmp_permute_tenosr, permute_dim, num_dims, dst);
1274+
1275+
// timestep * freq
1276+
int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
1277+
src->ne[3]};
1278+
size_t tmp_mul_nb[GGML_MAX_DIMS];
1279+
tmp_mul_nb[0] = ggml_type_size(src->type);
1280+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
1281+
tmp_mul_nb[i] = tmp_mul_nb[i-1] * tmp_mul_ne[i-1];
1282+
}
1283+
1284+
int mul_nelements = src->ne[1] * half * src->ne[0] * src->ne[2] *
1285+
src->ne[3];
1286+
1287+
void* tmp_mul_buffer = ctx.alloc_buffer(dst, mul_nelements *
1288+
ggml_type_size(src->type));
1289+
aclTensor* tmp_mul_tensor = create_acl_tensor(tmp_mul_buffer,
1290+
type_mapping(src->type),
1291+
ggml_type_size(src->type),
1292+
tmp_mul_ne,
1293+
tmp_mul_nb,
1294+
GGML_MAX_DIMS,
1295+
ACL_FORMAT_ND);
1296+
aclnn_noinplcace_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor,
1297+
tmp_mul_tensor, dst);
1298+
1299+
// cos
1300+
void* tmp_cos_buffer = ctx.alloc_buffer(dst, mul_nelements *
1301+
ggml_type_size(src->type));
1302+
aclTensor* tmp_cos_tensor = create_acl_tensor(tmp_cos_buffer,
1303+
type_mapping(dst->type),
1304+
ggml_type_size(dst->type),
1305+
tmp_mul_ne,
1306+
tmp_mul_nb,
1307+
GGML_MAX_DIMS,
1308+
ACL_FORMAT_ND);
1309+
1310+
aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor, dst);
1311+
1312+
// sin
1313+
void* tmp_sin_buffer = ctx.alloc_buffer(dst, mul_nelements *
1314+
ggml_type_size(src->type));
1315+
aclTensor* tmp_sin_tensor = create_acl_tensor(tmp_sin_buffer,
1316+
type_mapping(dst->type),
1317+
ggml_type_size(dst->type),
1318+
tmp_mul_ne,
1319+
tmp_mul_nb,
1320+
GGML_MAX_DIMS,
1321+
ACL_FORMAT_ND);
1322+
1323+
aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor, dst);
1324+
1325+
// concat
1326+
int64_t concat_dim = 3;
1327+
aclTensor* acl_dst = create_acl_tensor(dst);
1328+
aclnn_concat(ctx, tmp_cos_tensor, tmp_sin_tensor, acl_dst, concat_dim, dst);
1329+
1330+
// release
1331+
ACL_CHECK(aclDestroyTensor(acl_src));
1332+
ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
1333+
ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr));
1334+
ACL_CHECK(aclDestroyTensor(tmp_mul_tensor));
1335+
ACL_CHECK(aclDestroyTensor(tmp_cos_tensor));
1336+
ACL_CHECK(aclDestroyTensor(tmp_sin_tensor));
1337+
ACL_CHECK(aclDestroyTensor(acl_dst));
1338+
}

ggml-cann/aclnn_ops.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
6262

6363
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
6464

65+
void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst);
66+
6567
template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
6668
aclTensor*, uint64_t*, aclOpExecutor**),
6769
aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>

0 commit comments

Comments
 (0)