#include <cmath>
#include <cstring>

-static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
-    // TODO move to hparams if a T5 variant appears that uses a different value
-    const int64_t max_distance = 128;
-
-    if (bidirectional) {
-        n_buckets >>= 1;
-    }
-
-    const int64_t max_exact = n_buckets >> 1;
-
-    int32_t relative_position = x - y;
-    int32_t relative_bucket = 0;
-
-    if (bidirectional) {
-        relative_bucket += (relative_position > 0) * n_buckets;
-        relative_position = abs(relative_position);
-    } else {
-        relative_position = -std::min<int32_t>(relative_position, 0);
-    }
-
-    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
-    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
-    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
-
-    return relative_bucket;
-}
-
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
    if (ubatch->token) {
        const int64_t n_tokens = ubatch->n_tokens;
@@ -110,22 +83,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {

void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
    if (pos_bucket) {
-        const int64_t n_tokens = ubatch->n_tokens;
-
-        GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
-        GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
-
-        int32_t * data = (int32_t *) pos_bucket->data;
-
-        const int64_t n_kv = kv_self->n;
-
-        for (int h = 0; h < 1; ++h) {
-            for (int j = 0; j < n_tokens; ++j) {
-                for (int i = 0; i < n_kv; ++i) {
-                    data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
-                }
-            }
-        }
+        kv_self->set_input_pos_bucket(pos_bucket, ubatch);
    }
}

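The removed loop presumably moves behind llama_kv_cache_unified::set_input_pos_bucket. A minimal standalone sketch of the layout it fills, one row of n_kv bucket ids per ubatch token, reusing the llama_relative_position_bucket definition added at the end of this diff; the helper name fill_pos_bucket and the cell_pos/tok_pos vectors are illustrative stand-ins, not part of the PR:

// Standalone sketch of the pos_bucket fill, same row-major layout as the removed loop: data[j*n_kv + i].
#include <cstdint>
#include <vector>

using llama_pos = int32_t; // assumption: matches the typedef in llama.h

int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);

void fill_pos_bucket(int32_t * data,
                     const std::vector<llama_pos> & cell_pos, // stand-in for kv_self->cells[i].pos
                     const std::vector<llama_pos> & tok_pos,  // stand-in for ubatch->pos[j]
                     uint64_t n_rel_attn_bkts) {
    const int64_t n_kv     = cell_pos.size();
    const int64_t n_tokens = tok_pos.size();

    for (int64_t j = 0; j < n_tokens; ++j) {
        for (int64_t i = 0; i < n_kv; ++i) {
            // decoder self-attention uses causal (non-bidirectional) buckets
            data[j*n_kv + i] = llama_relative_position_bucket(cell_pos[i], tok_pos[j], n_rel_attn_bkts, false);
        }
    }
}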
@@ -403,99 +361,12 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
}

void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
-    if (self_kq_mask || self_kq_mask_swa) {
-        const int64_t n_kv         = kv_self->n;
-        const int64_t n_tokens     = ubatch->n_tokens;
-        const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-        const int64_t n_seqs       = ubatch->n_seqs;
-
-        float * data     = nullptr;
-        float * data_swa = nullptr;
-
-        if (self_kq_mask) {
-            GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
-            data = (float *) self_kq_mask->data;
-        }
-
-        if (self_kq_mask_swa) {
-            GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
-            data_swa = (float *) self_kq_mask_swa->data;
-        }
-
-        // Use only the previous KV cells of the correct sequence for each token of the ubatch.
-        // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
-        // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
-        //   Causal mask:
-        //      xxx-------
-        //      xxxx------
-        //      xxxxx-----
-        //   Non-causal mask:
-        //      xxxxx-----
-        //      xxxxx-----
-        //      xxxxx-----
-        // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
-        for (int h = 0; h < 1; ++h) {
-            for (int s = 0; s < n_seqs; ++s) {
-                const llama_seq_id seq_id = ubatch->seq_id[s][0];
-
-                for (int j = 0; j < n_seq_tokens; ++j) {
-                    const llama_pos pos = ubatch->pos[s*n_seq_tokens + j];
-                    for (int i = 0; i < n_kv; ++i) {
-                        float f;
-                        // mask the token if:
-                        if (!kv_self->cells[i].has_seq_id(seq_id) // not the correct sequence
-                            || (cparams.causal_attn && kv_self->cells[i].pos > pos) // for causal, mask future tokens
-                        ) {
-                            f = -INFINITY;
-                        } else {
-                            if (hparams.use_alibi) {
-                                f = -std::abs(kv_self->cells[i].pos - pos);
-                            } else {
-                                f = 0.0f;
-                            }
-                        }
-
-                        if (data) {
-                            data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                        }
-
-                        // may need to cut off old tokens for sliding window
-                        // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
-                        if (data_swa) {
-                            if (hparams.n_attn_chunk) {
-                                llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
-                                if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
-                                    f = -INFINITY;
-                                }
-                            } else {
-                                if (pos - kv_self->cells[i].pos >= (int32_t) hparams.n_swa) {
-                                    f = -INFINITY;
-                                }
-                            }
-                            data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                        }
-                    }
-                }
-            }
-
-            // mask padded tokens
-            if (data) {
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                    }
-                }
-            }
+    if (self_kq_mask) {
+        kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }

-            // mask padded tokens
-            if (data_swa) {
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                    }
-                }
-            }
-        }
+    if (self_kq_mask_swa) {
+        kv_self->set_input_kq_mask_swa(self_kq_mask_swa, ubatch, cparams.causal_attn);
    }
}

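Likewise, the mask construction removed here moves behind kv_self->set_input_kq_mask() and set_input_kq_mask_swa(). The per-element rule the old loop applied reduces to the following minimal sketch; it is not the actual llama_kv_cache_unified implementation, and same_seq stands in for the cells[i].has_seq_id(seq_id) check:

// Mask value for one (KV cell, query token) pair, mirroring the removed loop above.
#include <cmath>   // INFINITY
#include <cstdint>
#include <cstdlib> // std::abs(int)

using llama_pos = int32_t; // assumption: matches the typedef in llama.h

static float kq_mask_value(bool same_seq, llama_pos cell_pos, llama_pos query_pos,
                           bool causal_attn, bool use_alibi) {
    if (!same_seq || (causal_attn && cell_pos > query_pos)) {
        return -INFINITY; // wrong sequence, or a future token under causal attention
    }
    // ALiBi stores a distance-based bias; otherwise the cell is simply visible
    return use_alibi ? -std::abs(cell_pos - query_pos) : 0.0f;
}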
@@ -1152,7 +1023,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {

    auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);

-    const auto n_kv = kv_self->n;
+    const auto n_kv = kv_self->n_base();

    auto & cur = inp->pos_bucket;

@@ -1357,17 +1228,21 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()

    auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);

-    const auto n_kv = kv_self->n;
+    {
+        const auto n_kv = kv_self->n_base();

-    inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-    //cb(inp->self_kq_mask, "KQ_mask", -1);
-    ggml_set_input(inp->self_kq_mask);
+        inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask, "KQ_mask", -1);
+        ggml_set_input(inp->self_kq_mask);

-    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+    }

    if (hparams.n_swa_pattern > 1) {
        GGML_ASSERT(hparams.n_swa > 0);

+        const auto n_kv = kv_self->n_swa();
+
        inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
        //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
        ggml_set_input(inp->self_kq_mask_swa);
@@ -1397,6 +1272,9 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_build_forward_expand(gf, v_cur);

    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+
+    const auto & kv_layer = kv_self->get_layer(il);
+
    const auto & n_ctx = cparams.n_ctx;

    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
@@ -1408,11 +1286,11 @@ ggml_tensor * llm_graph_context::build_attn(

    // store to KV cache
    {
-        const auto kv_head = kv_self->head;
+        const auto kv_head = kv_layer.cells->head;

-        GGML_ASSERT(kv_self->size == n_ctx);
+        GGML_ASSERT(kv_layer.cells->size == n_ctx);

-        ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa)*kv_head);
+        ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_layer.k, n_tokens*n_embd_k_gqa, ggml_row_size(kv_layer.k->type, n_embd_k_gqa)*kv_head);
        //cb(k_cache_view, "k_cache_view", il);

        // note: storing RoPE-ed version of K in the KV cache
@@ -1423,12 +1301,12 @@ ggml_tensor * llm_graph_context::build_attn(
        ggml_tensor * v_cache_view = nullptr;

        if (!v_trans) {
-            v_cache_view = ggml_view_1d(ctx0, kv_self->v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa)*kv_head);
+            v_cache_view = ggml_view_1d(ctx0, kv_layer.v, n_tokens*n_embd_v_gqa, ggml_row_size(kv_layer.v->type, n_embd_v_gqa)*kv_head);
        } else {
            // note: the V cache is transposed when not using flash attention
-            v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa,
-                    (  n_ctx)*ggml_element_size(kv_self->v_l[il]),
-                    (kv_head)*ggml_element_size(kv_self->v_l[il]));
+            v_cache_view = ggml_view_2d(ctx0, kv_layer.v, n_tokens, n_embd_v_gqa,
+                    (  n_ctx)*ggml_element_size(kv_layer.v),
+                    (kv_head)*ggml_element_size(kv_layer.v));

            v_cur = ggml_transpose(ctx0, v_cur);
        }
@@ -1438,12 +1316,11 @@ ggml_tensor * llm_graph_context::build_attn(
    }

    const bool is_swa = hparams.is_swa(il);
+    const int64_t n_head_kv = hparams.n_head_kv(il);

    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();

-    const auto n_kv = kv_self->n;
-
-    const int64_t n_head_kv = hparams.n_head_kv(il);
+    const auto n_kv = kv_layer.cells->n;

    const auto & n_embd_head_k = hparams.n_embd_head_k;
    const auto & n_embd_head_v = hparams.n_embd_head_v;
@@ -1452,23 +1329,23 @@ ggml_tensor * llm_graph_context::build_attn(
    //cb(q, "q", il);

    ggml_tensor * k =
-        ggml_view_3d(ctx0, kv_self->k_l[il],
+        ggml_view_3d(ctx0, kv_layer.k,
                n_embd_head_k, n_kv, n_head_kv,
-                ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
-                ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
+                ggml_row_size(kv_layer.k->type, n_embd_k_gqa),
+                ggml_row_size(kv_layer.k->type, n_embd_head_k),
                0);
    //cb(k, "k", il);

    ggml_tensor * v = !v_trans ?
-        ggml_view_3d(ctx0, kv_self->v_l[il],
+        ggml_view_3d(ctx0, kv_layer.v,
                n_embd_head_v, n_kv, n_head_kv,
-                ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
-                ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v),
+                ggml_row_size(kv_layer.v->type, n_embd_v_gqa),
+                ggml_row_size(kv_layer.v->type, n_embd_head_v),
                0) :
-        ggml_view_3d(ctx0, kv_self->v_l[il],
+        ggml_view_3d(ctx0, kv_layer.v,
                n_kv, n_embd_head_v, n_head_kv,
-                ggml_element_size(kv_self->v_l[il])*n_ctx,
-                ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
+                ggml_element_size(kv_layer.v)*n_ctx,
+                ggml_element_size(kv_layer.v)*n_ctx*n_embd_head_v,
                0);

    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
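To make the K-view strides above concrete: nb1 (ggml_row_size(kv_layer.k->type, n_embd_k_gqa)) steps to the next KV cell, while nb2 (ggml_row_size(kv_layer.k->type, n_embd_head_k)) steps to the next KV head inside that cell's row. A small standalone sketch with illustrative sizes, assuming an f16 K cache so a row size is just the element count times 2 bytes:

#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
    // Illustrative sizes, not taken from any model in this PR.
    const int64_t n_embd_head_k = 128;
    const int64_t n_head_kv     = 8;
    const int64_t n_embd_k_gqa  = n_head_kv*n_embd_head_k; // one KV cell row
    const size_t  type_size     = 2;                       // f16

    // Strides matching the ggml_view_3d(ctx0, kv_layer.k, ...) call above.
    const size_t nb1 = type_size*n_embd_k_gqa;  // next KV cell
    const size_t nb2 = type_size*n_embd_head_k; // next KV head within the cell row

    // Byte offset of channel c of cell i for head h in the flat K buffer.
    const int64_t c = 5, i = 3, h = 2;
    const size_t offset = c*type_size + i*nb1 + h*nb2;
    assert(offset == type_size*(c + i*n_embd_k_gqa + h*n_embd_head_k));

    return 0;
}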
@@ -1700,3 +1577,30 @@ void llm_graph_context::build_pooling(

    ggml_build_forward_expand(gf, cur);
}
+
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
+    // TODO move to hparams if a T5 variant appears that uses a different value
+    const int64_t max_distance = 128;
+
+    if (bidirectional) {
+        n_buckets >>= 1;
+    }
+
+    const int64_t max_exact = n_buckets >> 1;
+
+    int32_t relative_position = x - y;
+    int32_t relative_bucket = 0;
+
+    if (bidirectional) {
+        relative_bucket += (relative_position > 0) * n_buckets;
+        relative_position = abs(relative_position);
+    } else {
+        relative_position = -std::min<int32_t>(relative_position, 0);
+    }
+
+    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
+    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
+    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
+
+    return relative_bucket;
+}
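As a quick sanity check of the bucketing above, a few hand-computed values for the causal (bidirectional = false) case with 32 buckets. The assertions are illustrative only and assume this translation unit is linked against the non-static definition added here (x is the KV cell position, y the query position, as in set_input_pos_bucket):

#include <cassert>
#include <cstdint>

using llama_pos = int32_t; // assumption: matches the typedef in llama.h

int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);

int main() {
    assert(llama_relative_position_bucket( 10, 10, 32, false) ==  0); // same position
    assert(llama_relative_position_bucket( 12, 10, 32, false) ==  0); // key after query -> clamped to 0
    assert(llama_relative_position_bucket(  7, 10, 32, false) ==  3); // distance 3 < max_exact (16): exact bucket
    assert(llama_relative_position_bucket(-40, 10, 32, false) == 24); // distance 50: log-spaced bucket
    return 0;
}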