kv-cache : improve slot allocation logic

ggerganov · ggerganov · commit c1434b819c54 · 2025-05-25T11:16:01.000+03:00
ggml-ci
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
@@ -554,16 +554,18 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
 
         bool found = true;
         for (uint32_t i = 0; i < n_tokens; i++) {
-            // TODO: improve to accept cells that are masked by the SWA
-            //if (!cells.is_empty(head_cur + i)) {
-
             const llama_seq_id seq_id = ubatch.seq_id[i][0];
 
+            // can we use this cell? either:
+            //  - the cell is empty
+            //  - the cell is occupied only by the same sequence, and it passes the causal and SWA mask checks below
             const bool can_use =
                     cells.is_empty(head_cur + i) ||
                     (
-                        cells.seq_has(head_cur + i, seq_id) && // TODO: seq_has_only
-                        is_masked_swa(cells.pos_get(head_cur + i), ubatch.seq_pos_min[seq_id])
+                        cells.pos_get(head_cur + i) <= ubatch.pos[i] && // causal mask
+                        cells.seq_has(head_cur + i, seq_id) &&          // sequence mask
+                        cells.seq_count(head_cur + i) == 1 &&
+                        is_masked_swa(cells.pos_get(head_cur + i), ubatch.seq_pos_min[seq_id]) // SWA mask
                     );
 
             if (!can_use) {
diff --git a/src/llama-kv-cells.h b/src/llama-kv-cells.h
@@ -155,6 +155,13 @@ class llama_kv_cells_unified {
         return false;
     }
 
+    int seq_count(uint32_t i) const { // returns seq[i].count() for the cell at index i
+        assert(i < pos.size()); // the index must be within the cell array
+        assert(pos[i] != -1); // presumably -1 marks an empty cell, so callers check emptiness first - TODO confirm
+
+        return seq[i].count(); // presumably the number of sequence ids set for this cell - TODO confirm type of seq[i]
+    }
+
     bool seq_has(uint32_t i, llama_seq_id seq_id) const {
         assert(i < pos.size());
         assert(seq_id >= 0);