Commit 7cd5a1f

server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case was triggered, an extra token was kept in cache_tokens even if it was removed from the KV cache. For Mamba, this caused useless prompt reprocessing when the previous request triggered the above case.
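
For context, the special case the message refers to sits between the two hunks shown below: when every prompt token is already in the cache, the server steps n_past back by one so that at least one token is evaluated and logits are produced. A paraphrased sketch of the interaction (not the exact server.cpp code):

    // when the whole prompt is already cached, step back one token so the
    // decode batch is non-empty and logits can be generated
    if (slot.n_past == (int) prompt_tokens.size() && slot.n_past > 0) {
        slot.n_past--;
    }

    // the KV cache is then truncated to the (possibly decremented) n_past;
    // before this commit, cache_tokens had already been resized to the
    // pre-decrement value, so it kept one token the KV cache no longer held
    slot.cache_tokens.resize(slot.n_past);

That stale entry made cache_tokens disagree with the actual cache state, which, per the commit message, caused useless prompt reprocessing for Mamba on the next request.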
Parent: 916b586


examples/server/server.cpp

Lines changed: 3 additions & 4 deletions
@@ -1797,9 +1797,6 @@ struct server_context {
                     // reuse any previously computed tokens that are common with the new prompt
                     slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
 
-                    // remove the non-common part from the cache
-                    slot.cache_tokens.resize(slot.n_past);
-
                     // push the prompt into the sampling context (do not apply grammar)
                     for (int i = 0; i < slot.n_past; ++i) {
                         llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
@@ -1846,11 +1843,13 @@ struct server_context {
                     slot.n_past = 0;
                     slot.n_past_se = 0;
                     slot.ga_i = 0;
-                    slot.cache_tokens.clear();
                     // TODO: is the system prompt ever in the sampling context?
                     llama_sampling_reset(slot.ctx_sampling);
                 }
 
+                // remove the non-common part from the cache
+                slot.cache_tokens.resize(slot.n_past);
+
                 LOG_INFO("kv cache rm [p0, end)", {
                     { "id_slot", slot.id },
                     { "id_task", slot.id_task },
