
Commit 295354e

llama : fix K-shift with quantized K and BLAS backend (#13113)
1 parent: 558a764 · commit: 295354e

File tree: 2 files changed (+4, -16 lines)


src/llama-context.cpp

Lines changed: 3 additions & 14 deletions
@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
         float freq_base,
-        float freq_scale,
-        ggml_backend_buffer * bbuf) const {
+        float freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

     const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);

-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
-        tmp = ggml_rope_ext_inplace(ctx0, tmp,
+        tmp = ggml_rope_ext(ctx0, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);

@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
                 ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
                 0);

-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);

         ggml_build_forward_expand(gf, cur);
     }
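The core of the change is the quantized-K path above: dequantize the cache view to F32, apply RoPE out of place, and let the ggml scheduler place the ops, instead of pinning the temporary tensor onto the buffer's backend. Below is a minimal standalone sketch of that pattern; the helper name, tensor arguments, and the fixed RoPE/YaRN parameters are illustrative assumptions, and the quantize-back ggml_cpy mirrors the surrounding code in the file rather than this hunk.

// Sketch only: dequantize -> RoPE -> quantize-back for a quantized K view.
// Shapes and parameters here are assumptions for illustration.
#include "ggml.h"

static ggml_tensor * rope_shift_quantized(
        ggml_context * ctx,
        ggml_tensor  * k_view,  // quantized view into the K cache
        ggml_tensor  * shift,   // per-position shift deltas (I32)
        int            n_rot) {
    // dequantize to f32 so any backend can run the rotation
    ggml_tensor * tmp = ggml_cast(ctx, k_view, GGML_TYPE_F32);

    // out-of-place RoPE: the scheduler picks a backend that supports the op
    tmp = ggml_rope_ext(ctx, tmp, shift, /*factors=*/nullptr,
            n_rot, GGML_ROPE_TYPE_NEOX, /*n_ctx_orig=*/0,
            /*freq_base=*/10000.0f, /*freq_scale=*/1.0f,
            /*ext_factor=*/0.0f, /*attn_factor=*/1.0f,
            /*beta_fast=*/32.0f, /*beta_slow=*/1.0f);

    // quantize back into the original cache view
    return ggml_cpy(ctx, tmp, k_view);
}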

src/llama-context.h

Lines changed: 1 addition & 2 deletions
@@ -170,8 +170,7 @@ struct llama_context {
             ggml_tensor * shift,
             ggml_tensor * factors,
             float freq_base,
-            float freq_scale,
-            ggml_backend_buffer * bbuf) const;
+            float freq_scale) const;

     llm_graph_result_ptr build_kv_self_shift(
             ggml_context * ctx0,
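Dropping the ggml_backend_buffer * bbuf parameter removes the manual backend pinning entirely. For illustration, here is a hypothetical standalone helper equivalent to the deleted loop; the reading that, in a BLAS build, the first backend matching the KV buffer type could be one that does not implement the RoPE op is an assumption inferred from the commit title, not stated in the diff.

// Hypothetical helper mirroring the deleted pinning loop. Fragile, because
// supporting a buffer type does not imply supporting every op that will run
// on the pinned tensor (e.g. a BLAS backend without RoPE support).
#include "ggml-backend.h"
#include <vector>

static void pin_to_buffer_backend(
        ggml_backend_sched_t                sched,
        const std::vector<ggml_backend_t> & backends,
        ggml_tensor *                       node,
        ggml_backend_buffer_t               buf) {
    for (ggml_backend_t backend : backends) {
        if (ggml_backend_supports_buft(backend, ggml_backend_buffer_get_type(buf))) {
            ggml_backend_sched_set_tensor_backend(sched, node, backend);
            break;
        }
    }
}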
