Commit 3cf5186

server : upon full re-processing, remove the sequence from the cache
1 parent e230e51 commit 3cf5186

File tree

1 file changed: +1 -0 lines changed

tools/server/server.cpp

Lines changed: 1 addition & 0 deletions
@@ -3219,6 +3219,7 @@ struct server_context {
                     SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min);
                     SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
                             "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+                    llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
                     slot.n_past = 0;
                 }
             }
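
For context on the one-line change: in the llama.cpp API, llama_kv_self_seq_rm(ctx, seq_id, p0, p1) removes the KV-cache cells that belong to seq_id in the position range [p0, p1), with a negative bound extending the range to the respective end, so llama_kv_self_seq_rm(ctx, slot.id, 0, -1) drops the slot's entire sequence. The sketch below restates the patched path; the surrounding guard is paraphrased (the hunk does not show it), and only the two statements from the diff are verbatim:

    // Paraphrased guard (an assumption, not shown in this hunk): the cache
    // no longer holds the oldest positions of this slot's sequence, e.g. a
    // sliding-window-attention (SWA) model evicted them, so the cached
    // prefix cannot be partially reused.
    if (pos_min > 0) {
        // Remove every cell belonging to this slot's sequence; with a
        // negative end position, the range [0, -1) means "from position 0
        // to the end of the cache".
        llama_kv_self_seq_rm(ctx, slot.id, 0, -1);

        // Re-process the full prompt from scratch.
        slot.n_past = 0;
    }

The point of the change appears to be that resetting slot.n_past alone would re-decode the prompt on top of the stale cells still stored for slot.id; removing the sequence first lets the full re-processing start from an empty, consistent cache state.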
