@@ -2746,6 +2746,7 @@ static struct ggml_cgraph * llm_build_llama(
             ggml_set_name(cur, "attention_norm_0");
         }

+        // shift the entire K-cache if needed
         if (do_rope_shift) {
             ggml_build_forward_expand(gf,
                     ggml_rope_custom_inplace(ctx0,
@@ -2987,6 +2988,8 @@ static struct ggml_cgraph * llm_build_baichaun(
     const int32_t n_tokens = batch.n_tokens;
     const int32_t n_kv     = llama_kv_cache_cell_max(kv_self);

+    const bool do_rope_shift = kv_self.has_shift || ggml_allocr_is_measure(lctx.alloc);
+
     auto & buf_compute = lctx.buf_compute;

     struct ggml_init_params params = {
@@ -3090,6 +3093,16 @@ static struct ggml_cgraph * llm_build_baichaun(
         }
     }

+    // K_shift
+    struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+    ggml_allocr_alloc(lctx.alloc, K_shift);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) K_shift->data;
+        for (int i = 0; i < n_ctx; ++i) {
+            data[i] = kv_self.cells[i].delta;
+        }
+    }
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);

@@ -3115,6 +3128,18 @@ static struct ggml_cgraph * llm_build_baichaun(
             ggml_set_name(cur, "attention_norm_0");
         }

+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            ggml_build_forward_expand(gf,
+                    ggml_rope_custom_inplace(ctx0,
+                        ggml_view_3d(ctx0, kv_self.k,
+                            n_embd_head, n_head_kv, n_ctx,
+                            ggml_element_size(kv_self.k)*n_embd_head,
+                            ggml_element_size(kv_self.k)*n_embd_gqa,
+                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                        K_shift, n_embd_head, 0, 0, freq_base, freq_scale));
+        }
+
         // self-attention
         {
             // compute Q and K and RoPE them
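For context (not part of the commit): the K_shift tensor built above is filled from per-cell delta values that the KV-cache bookkeeping accumulates whenever cached tokens are moved to new positions, and do_rope_shift only fires when such a pending shift exists (or during the allocator measure pass). Below is a minimal, self-contained sketch of that host-side bookkeeping, assuming a cache that tracks one (pos, delta) pair per slot. The kv_cell/kv_cache_sketch structs and the shift_cells() helper are illustrative assumptions that mirror only the field names referenced in the diff (cells[i].delta, has_shift); they are not llama.cpp's actual cache code.

    /*
     * Illustrative sketch only: host-side bookkeeping that produces the values
     * gathered into K_shift by the graph-building code above.
     */
    #include <stdbool.h>
    #include <stdint.h>

    struct kv_cell {
        int32_t pos;    /* position currently stored in this cell            */
        int32_t delta;  /* accumulated shift still to be applied to its keys */
    };

    struct kv_cache_sketch {
        struct kv_cell *cells;
        int32_t         n_ctx;
        bool            has_shift;  /* any cell has a non-zero pending delta */
    };

    /* Hypothetical helper: move every cell whose position lies in [p0, p1) by d. */
    static void shift_cells(struct kv_cache_sketch *kv, int32_t p0, int32_t p1, int32_t d) {
        for (int32_t i = 0; i < kv->n_ctx; ++i) {
            if (kv->cells[i].pos >= p0 && kv->cells[i].pos < p1) {
                kv->cells[i].pos   += d;
                kv->cells[i].delta += d;
                kv->has_shift = true;
            }
        }
    }

    /* Gather per-cell deltas into a flat int array, like the K_shift fill loop above. */
    static void fill_k_shift(const struct kv_cache_sketch *kv, int32_t *k_shift) {
        for (int32_t i = 0; i < kv->n_ctx; ++i) {
            k_shift[i] = kv->cells[i].delta;
        }
    }

Once the graph has applied the rotation, the cache can clear has_shift and the deltas; that clean-up step is outside the scope of these hunks.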
@@ -3362,6 +3387,8 @@ static struct ggml_cgraph * llm_build_falcon(
     const int32_t n_tokens = batch.n_tokens;
     const int32_t n_kv     = llama_kv_cache_cell_max(kv_self);

+    const bool do_rope_shift = kv_self.has_shift || ggml_allocr_is_measure(lctx.alloc);
+
     auto & buf_compute = lctx.buf_compute;

     struct ggml_init_params params = {
@@ -3465,6 +3492,16 @@ static struct ggml_cgraph * llm_build_falcon(
         }
     }

+    // K_shift
+    struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+    ggml_allocr_alloc(lctx.alloc, K_shift);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) K_shift->data;
+        for (int i = 0; i < n_ctx; ++i) {
+            data[i] = kv_self.cells[i].delta;
+        }
+    }
+
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * attn_norm;

@@ -3476,6 +3513,18 @@ static struct ggml_cgraph * llm_build_falcon(
         }
 #endif // GGML_USE_CUBLAS

+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            ggml_build_forward_expand(gf,
+                    ggml_rope_custom_inplace(ctx0,
+                        ggml_view_3d(ctx0, kv_self.k,
+                            n_embd_head, n_head_kv, n_ctx,
+                            ggml_element_size(kv_self.k)*n_embd_head,
+                            ggml_element_size(kv_self.k)*n_embd_gqa,
+                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                        K_shift, n_embd_head, 2, 0, freq_base, freq_scale));
+        }
+
         // self-attention
         // TODO: refactor into common function (shared with LLaMA)
         {
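The falcon block matches the baichuan one except for the RoPE mode argument to ggml_rope_custom_inplace (2 instead of 0), which selects the NeoX-style rotation Falcon uses. In all three builders the 3-D view handed to the rope op covers exactly one layer's slab of kv_self.k. The sketch below spells out the stride arithmetic under the assumption that K is stored layer-major with one contiguous [n_embd_gqa, n_ctx] slab per layer and n_embd_gqa = n_embd_head * n_head_kv; the struct and helper are illustrative, not part of ggml.

    /* Sketch of the byte offsets/strides used by the per-layer K view above. */
    #include <stddef.h>

    struct k_view_sketch {
        size_t ne[3];   /* view shape: n_embd_head x n_head_kv x n_ctx                 */
        size_t nb1;     /* bytes between heads within one cache slot: esz*n_embd_head  */
        size_t nb2;     /* bytes between cache slots:                 esz*n_embd_gqa   */
        size_t offset;  /* start of layer il:                esz*n_embd_gqa*n_ctx*il   */
    };

    static struct k_view_sketch k_view_for_layer(size_t esz, size_t n_embd_head,
                                                 size_t n_head_kv, size_t n_ctx, size_t il) {
        const size_t n_embd_gqa = n_embd_head * n_head_kv;

        struct k_view_sketch v;
        v.ne[0]  = n_embd_head;
        v.ne[1]  = n_head_kv;
        v.ne[2]  = n_ctx;
        v.nb1    = esz * n_embd_head;
        v.nb2    = esz * n_embd_gqa;
        v.offset = esz * n_embd_gqa * n_ctx * il;
        return v;
    }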