Commit 7cd5a1f

server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case was triggered, an extra token was kept in cache_tokens even if it was removed from the KV cache. For Mamba, this caused useless prompt reprocessing when the previous request triggered the above case.
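
For context, the special case the message refers to sits between the two hunks shown below: when every prompt token is already in the cache, the server steps n_past back by one so that at least one token is evaluated and logits are produced. A paraphrased sketch of the interaction (not the exact server.cpp code):

    // when the whole prompt is already cached, step back one token so the
    // decode batch is non-empty and logits can be generated
    if (slot.n_past == (int) prompt_tokens.size() && slot.n_past > 0) {
        slot.n_past--;
    }

    // the KV cache is then truncated to the (possibly decremented) n_past;
    // before this commit, cache_tokens had already been resized to the
    // pre-decrement value, so it kept one token the KV cache no longer held
    slot.cache_tokens.resize(slot.n_past);

That stale entry made cache_tokens disagree with the actual cache state, which, per the commit message, caused useless prompt reprocessing for Mamba on the next request.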
Parent: 916b586


examples/server/server.cpp

Lines changed: 3 additions & 4 deletions
@@ -1797,9 +1797,6 @@ struct server_context {
                     // reuse any previously computed tokens that are common with the new prompt
                     slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
 
-                    // remove the non-common part from the cache
-                    slot.cache_tokens.resize(slot.n_past);
-
                     // push the prompt into the sampling context (do not apply grammar)
                     for (int i = 0; i < slot.n_past; ++i) {
                         llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
@@ -1846,11 +1843,13 @@ struct server_context {
                     slot.n_past = 0;
                     slot.n_past_se = 0;
                     slot.ga_i = 0;
-                    slot.cache_tokens.clear();
                     // TODO: is the system prompt ever in the sampling context?
                     llama_sampling_reset(slot.ctx_sampling);
                 }
 
+                // remove the non-common part from the cache
+                slot.cache_tokens.resize(slot.n_past);
+
                 LOG_INFO("kv cache rm [p0, end)", {
                     { "id_slot", slot.id },
                     { "id_task", slot.id_task },
