From f547c4f54a1f70cc660d194bab4d2b5414be8a7f Mon Sep 17 00:00:00 2001
From: Maximilian Winter
Date: Sun, 12 May 2024 12:06:36 +0200
Subject: [PATCH 1/2] Update server.cpp

---
 examples/server/server.cpp | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ceaeb1f76dc3d..32ceab1fbf8b2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2029,7 +2029,24 @@ struct server_context {

                     // reuse any previously computed tokens that are common with the new prompt
                     slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
-
+                    if (slot.ga_n != 1)
+                    {
+                        int ga_i = 0;
+                        int32_t ga_n = slot.ga_n;
+                        int32_t ga_w = slot.ga_w;
+                        int32_t slot_npast = 0;
+                        for (int k = 0; k < slot.n_past; ++k)
+                        {
+                            while (slot_npast >= ga_i + ga_w) {
+                                const int bd = (ga_w/ga_n)*(ga_n - 1);
+                                slot_npast -= bd;
+                                ga_i += ga_w/ga_n;
+                            }
+                            slot_npast++;
+                        }
+                        slot.n_past_se = slot_npast;
+                        slot.ga_i = ga_i;
+                    }
                     // push the prompt into the sampling context (do not apply grammar)
                     for (int i = 0; i < slot.n_past; ++i) {
                         llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);

From f4f5b7ac560de66be4e875210f8c3679ef4b3dac Mon Sep 17 00:00:00 2001
From: Maximilian Winter
Date: Sun, 12 May 2024 16:27:32 +0200
Subject: [PATCH 2/2] Removed changes

---
 examples/server/server.cpp | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 32ceab1fbf8b2..86c6312e4d2eb 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2029,24 +2029,7 @@ struct server_context {

                     // reuse any previously computed tokens that are common with the new prompt
                     slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
-                    if (slot.ga_n != 1)
-                    {
-                        int ga_i = 0;
-                        int32_t ga_n = slot.ga_n;
-                        int32_t ga_w = slot.ga_w;
-                        int32_t slot_npast = 0;
-                        for (int k = 0; k < slot.n_past; ++k)
-                        {
-                            while (slot_npast >= ga_i + ga_w) {
-                                const int bd = (ga_w/ga_n)*(ga_n - 1);
-                                slot_npast -= bd;
-                                ga_i += ga_w/ga_n;
-                            }
-                            slot_npast++;
-                        }
-                        slot.n_past_se = slot_npast;
-                        slot.ga_i = ga_i;
-                    }
+
                     // push the prompt into the sampling context (do not apply grammar)
                     for (int i = 0; i < slot.n_past; ++i) {
                         llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);