
Commit 23e1e54

kv-cache : fix find_slot() logic for free slots
ggml-ci
1 parent 9fbab1c

2 files changed: 7 additions & 6 deletions


src/llama-kv-cache.cpp

Lines changed: 7 additions & 4 deletions
@@ -590,6 +590,7 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
 
     bool found = true;
     for (uint32_t i = 0; i < n_tokens; i++) {
+        const llama_pos pos = ubatch.pos[i];
         const llama_seq_id seq_id = ubatch.seq_id[i][0];
 
         // can we use this cell? either:
@@ -598,10 +599,12 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
         const bool can_use =
             cells.is_empty(head_cur + i) ||
             (
-                cells.pos_get(head_cur + i) <= ubatch.pos[i] && // causal mask
-                cells.seq_has(head_cur + i, seq_id) && // sequence mask
-                cells.seq_count(head_cur + i) == 1 &&
-                is_masked_swa(cells.pos_get(head_cur + i), ubatch.seq_pos_min[seq_id]) // SWA mask
+                cells.seq_has (head_cur + i, seq_id) && // sequence mask
+                cells.seq_count(head_cur + i) == 1 &&
+                (
+                    cells.pos_get (head_cur + i) >= pos || // causal mask
+                    is_masked_swa(cells.pos_get(head_cur + i), ubatch.seq_pos_min[seq_id]) // SWA mask
+                )
             );
 
         if (!can_use) {
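
What changed in the predicate: the causal position check flipped from cells.pos_get(head_cur + i) <= ubatch.pos[i] to cells.pos_get(head_cur + i) >= pos, and it is now OR-ed with the SWA check instead of AND-ed, while the sequence checks (belongs to this sequence, and only this sequence) remain mandatory. So an occupied cell is treated as free only when it is owned exclusively by the incoming sequence and its stored token either sits at or after the new position (it would be displaced anyway) or is already masked out by the sliding window. Below is a compilable sketch of the post-fix predicate; the kv_cell struct and the window rule inside is_masked_swa() are simplified stand-ins for illustration, not the real llama.cpp implementations:

// Sketch of the corrected can_use logic from find_slot().
// kv_cell and the SWA window rule are assumptions, not the real code.
#include <cstdint>
#include <vector>

using llama_pos    = int32_t;
using llama_seq_id = int32_t;

struct kv_cell {
    llama_pos pos = -1;                 // -1 means the cell is empty
    std::vector<llama_seq_id> seq_ids;  // sequences referencing this cell
};

// assumed window rule: positions behind the window start are masked out
static bool is_masked_swa(llama_pos cell_pos, llama_pos seq_pos_min, llama_pos n_swa) {
    return cell_pos < seq_pos_min - n_swa;
}

// mirrors the fixed expression: empty, or exclusively-owned and
// (stale position OR outside the SWA window)
static bool can_use(const kv_cell & c, llama_pos pos, llama_seq_id seq_id,
                    llama_pos seq_pos_min, llama_pos n_swa) {
    if (c.pos < 0) {
        return true;  // empty cell: always usable
    }
    // occupied cell: must belong to the incoming sequence, and to it alone
    if (c.seq_ids.size() != 1 || c.seq_ids[0] != seq_id) {
        return false;
    }
    // reclaimable if the stored token is at/after the incoming position
    // (it would be overwritten anyway) or already SWA-masked
    return c.pos >= pos || is_masked_swa(c.pos, seq_pos_min, n_swa);
}

With the previous conjunction, an occupied cell had to satisfy both position conditions at once to count as free; the disjunction lets either one suffice, so any single-sequence cell whose content is stale or outside the attention window can be reclaimed.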

src/llama-kv-cache.h

Lines changed: 0 additions & 2 deletions
@@ -33,7 +33,6 @@ struct llama_kv_cache : public llama_memory_i {
     // process any pending defrag/shift/etc. operations
     // optionally call once before processing a new batch
     // return true if any operations were performed
-    // will reserve a new worst-case graph if needed
     virtual bool update(llama_context & lctx) = 0;
 
     // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
@@ -244,7 +243,6 @@ class llama_kv_cache_unified : public llama_kv_cache {
 
 // utilizes two instances of llama_kv_cache_unified
 // the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
-// upon successful processing of the batch, the SWA cache removes old tokens outside the n_swa window
 
 class llama_kv_cache_unified_iswa : public llama_kv_cache {
 public:
