From f547c4f54a1f70cc660d194bab4d2b5414be8a7f Mon Sep 17 00:00:00 2001
From: Maximilian Winter
Date: Sun, 12 May 2024 12:06:36 +0200
Subject: [PATCH 1/2] Update server.cpp

---
 examples/server/server.cpp | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ceaeb1f76dc3d..32ceab1fbf8b2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2029,7 +2029,24 @@ struct server_context {

                     // reuse any previously computed tokens that are common with the new prompt
                     slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
-
+                    if (slot.ga_n != 1)
+                    {
+                        int ga_i = 0;
+                        int32_t ga_n = slot.ga_n;
+                        int32_t ga_w = slot.ga_w;
+                        int32_t slot_npast = 0;
+                        for (int k = 0; k < slot.n_past; ++k)
+                        {
+                            while (slot_npast >= ga_i + ga_w) {
+                                const int bd = (ga_w/ga_n)*(ga_n - 1);
+                                slot_npast -= bd;
+                                ga_i += ga_w/ga_n;
+                            }
+                            slot_npast++;
+                        }
+                        slot.n_past_se = slot_npast;
+                        slot.ga_i = ga_i;
+                    }
                     // push the prompt into the sampling context (do not apply grammar)
                     for (int i = 0; i < slot.n_past; ++i) {
                         llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);

From f4f5b7ac560de66be4e875210f8c3679ef4b3dac Mon Sep 17 00:00:00 2001
From: Maximilian Winter
Date: Sun, 12 May 2024 16:27:32 +0200
Subject: [PATCH 2/2] Removed changes

---
 examples/server/server.cpp | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 32ceab1fbf8b2..86c6312e4d2eb 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2029,24 +2029,7 @@ struct server_context {

                     // reuse any previously computed tokens that are common with the new prompt
                     slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
-                    if (slot.ga_n != 1)
-                    {
-                        int ga_i = 0;
-                        int32_t ga_n = slot.ga_n;
-                        int32_t ga_w = slot.ga_w;
-                        int32_t slot_npast = 0;
-                        for (int k = 0; k < slot.n_past; ++k)
-                        {
-                            while (slot_npast >= ga_i + ga_w) {
-                                const int bd = (ga_w/ga_n)*(ga_n - 1);
-                                slot_npast -= bd;
-                                ga_i += ga_w/ga_n;
-                            }
-                            slot_npast++;
-                        }
-                        slot.n_past_se = slot_npast;
-                        slot.ga_i = ga_i;
-                    }
+
                     // push the prompt into the sampling context (do not apply grammar)
                     for (int i = 0; i < slot.n_past; ++i) {
                         llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);