
Commit 23e1e54

kv-cache : fix find_slot() logic for free slots
ggml-ci
1 parent 9fbab1c

2 files changed: 7 additions & 6 deletions


src/llama-kv-cache.cpp

Lines changed: 7 additions & 4 deletions
@@ -590,6 +590,7 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
 
     bool found = true;
     for (uint32_t i = 0; i < n_tokens; i++) {
+        const llama_pos pos = ubatch.pos[i];
         const llama_seq_id seq_id = ubatch.seq_id[i][0];
 
         // can we use this cell? either:
@@ -598,10 +599,12 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
         const bool can_use =
             cells.is_empty(head_cur + i) ||
             (
-                cells.pos_get(head_cur + i) <= ubatch.pos[i] && // causal mask
-                cells.seq_has(head_cur + i, seq_id) && // sequence mask
-                cells.seq_count(head_cur + i) == 1 &&
-                is_masked_swa(cells.pos_get(head_cur + i), ubatch.seq_pos_min[seq_id]) // SWA mask
+                cells.seq_has (head_cur + i, seq_id) && // sequence mask
+                cells.seq_count(head_cur + i) == 1 &&
+                (
+                    cells.pos_get (head_cur + i) >= pos || // causal mask
+                    is_masked_swa(cells.pos_get(head_cur + i), ubatch.seq_pos_min[seq_id]) // SWA mask
+                )
             );
 
         if (!can_use) {
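
What changed in the predicate: the causal position check flipped from cells.pos_get(head_cur + i) <= ubatch.pos[i] to cells.pos_get(head_cur + i) >= pos, and it is now OR-ed with the SWA check instead of AND-ed, while the sequence checks (belongs to this sequence, and only this sequence) remain mandatory. So an occupied cell is treated as free only when it is owned exclusively by the incoming sequence and its stored token either sits at or after the new position (it would be displaced anyway) or is already masked out by the sliding window. Below is a compilable sketch of the post-fix predicate; the kv_cell struct and the window rule inside is_masked_swa() are simplified stand-ins for illustration, not the real llama.cpp implementations:

// Sketch of the corrected can_use logic from find_slot().
// kv_cell and the SWA window rule are assumptions, not the real code.
#include <cstdint>
#include <vector>

using llama_pos    = int32_t;
using llama_seq_id = int32_t;

struct kv_cell {
    llama_pos pos = -1;                 // -1 means the cell is empty
    std::vector<llama_seq_id> seq_ids;  // sequences referencing this cell
};

// assumed window rule: positions behind the window start are masked out
static bool is_masked_swa(llama_pos cell_pos, llama_pos seq_pos_min, llama_pos n_swa) {
    return cell_pos < seq_pos_min - n_swa;
}

// mirrors the fixed expression: empty, or exclusively-owned and
// (stale position OR outside the SWA window)
static bool can_use(const kv_cell & c, llama_pos pos, llama_seq_id seq_id,
                    llama_pos seq_pos_min, llama_pos n_swa) {
    if (c.pos < 0) {
        return true;  // empty cell: always usable
    }
    // occupied cell: must belong to the incoming sequence, and to it alone
    if (c.seq_ids.size() != 1 || c.seq_ids[0] != seq_id) {
        return false;
    }
    // reclaimable if the stored token is at/after the incoming position
    // (it would be overwritten anyway) or already SWA-masked
    return c.pos >= pos || is_masked_swa(c.pos, seq_pos_min, n_swa);
}

With the previous conjunction, an occupied cell had to satisfy both position conditions at once to count as free; the disjunction lets either one suffice, so any single-sequence cell whose content is stale or outside the attention window can be reclaimed.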

src/llama-kv-cache.h

Lines changed: 0 additions & 2 deletions
@@ -33,7 +33,6 @@ struct llama_kv_cache : public llama_memory_i {
     // process any pending defrag/shift/etc. operations
     // optionally call once before processing a new batch
     // return true if any operations were performed
-    // will reserve a new worst-case graph if needed
     virtual bool update(llama_context & lctx) = 0;
 
     // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
@@ -244,7 +243,6 @@ class llama_kv_cache_unified : public llama_kv_cache {
 
 // utilizes two instances of llama_kv_cache_unified
 // the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
-// upon successful processing of the batch, the SWA cache removes old tokens outside the n_swa window
 
 class llama_kv_cache_unified_iswa : public llama_kv_cache {
 public:
