@@ -7773,6 +7773,7 @@ static int llama_decode_impl(
7773
7773
llama_batch_allocr batch_allocr (inp_batch, inp_batch.pos ? -1 : lctx.pos_max () + 1 );
7774
7774
7775
7775
const llama_batch & batch = batch_allocr.batch ;
7776
+
7776
7777
const uint32_t n_tokens_all = batch.n_tokens ;
7777
7778
7778
7779
const auto & model = lctx.model ;
@@ -7800,9 +7801,6 @@ static int llama_decode_impl(
7800
7801
}
7801
7802
lctx.n_queued_tokens += n_tokens_all;
7802
7803
7803
- auto & kv_self = lctx.kv_self ;
7804
- llama_kv_slot_restorer kv_slot_restorer (kv_self);
7805
-
7806
7804
const int64_t n_embd = hparams.n_embd ;
7807
7805
const int64_t n_vocab = vocab.n_tokens ();
7808
7806
@@ -7828,16 +7826,19 @@ static int llama_decode_impl(
7828
7826
n_outputs = 1 ;
7829
7827
}
7830
7828
7831
- lctx.sbatch .from_batch (batch, n_embd,
7832
- /* simple_split */ !kv_self.recurrent ,
7833
- /* logits_all */ n_outputs == n_tokens_all);
7834
-
7835
7829
// reserve output buffer
7836
7830
if (llama_output_reserve (lctx, n_outputs) < n_outputs) {
7837
7831
LLAMA_LOG_ERROR (" %s: could not reserve space for batch with %u outputs\n " , __func__, n_outputs);
7838
7832
return -2 ;
7839
7833
};
7840
7834
7835
+ auto & kv_self = lctx.kv_self ;
7836
+ llama_kv_slot_restorer kv_slot_restorer (kv_self);
7837
+
7838
+ lctx.sbatch .from_batch (batch, n_embd,
7839
+ /* simple_split */ !kv_self.recurrent ,
7840
+ /* logits_all */ n_outputs == n_tokens_all);
7841
+
7841
7842
while (lctx.sbatch .n_tokens > 0 ) {
7842
7843
llama_ubatch ubatch;
7843
7844
if (kv_self.recurrent ) {
@@ -8635,7 +8636,6 @@ struct llama_context * llama_init_from_model(
8635
8636
cparams.rope_freq_base = params.rope_freq_base == 0 .0f ? hparams.rope_freq_base_train : params.rope_freq_base ;
8636
8637
cparams.rope_freq_scale = params.rope_freq_scale == 0 .0f ? hparams.rope_freq_scale_train : params.rope_freq_scale ;
8637
8638
8638
- // this is necessary due to kv_self.n being padded later during inference
8639
8639
cparams.n_ctx = GGML_PAD (cparams.n_ctx , ctx->get_ctx_padding (cparams));
8640
8640
8641
8641
// with causal attention, the batch size is limited by the context size
0 commit comments