
Commit 1bce7e8

llama : use n_swa + n_ubatch cells for SWA cache + auto-batch
ggml-ci
1 parent 2252eef commit 1bce7e8

File tree: 9 files changed, +111 −174 lines

examples/parallel/parallel.cpp

Lines changed: 9 additions & 43 deletions

@@ -164,6 +164,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    params.n_batch = params.n_ctx;
+
     common_init();
 
     // number of simultaneous "clients" to simulate
@@ -356,59 +358,23 @@
             break;
         }
 
-        // process in chunks of params.n_batch
-        int32_t n_batch = params.n_batch;
-
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
-            // experiment: process in powers of 2
-            //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
-            //    n_batch /= 2;
-            //    i -= n_batch;
-            //    continue;
-            //}
-
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-
-            llama_batch batch_view = {
-                n_tokens,
-                batch.token    + i,
-                nullptr,
-                batch.pos      + i,
-                batch.n_seq_id + i,
-                batch.seq_id   + i,
-                batch.logits   + i,
-            };
-
-            const int ret = llama_decode(ctx, batch_view);
-            if (ret != 0) {
-                if (n_batch == 1 || ret < 0) {
-                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
-                    return 1;
-                }
-
-                LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
-
-                n_cache_miss += 1;
-
-                // retry with half the batch size to try to find a free slot in the KV cache
-                n_batch /= 2;
-                i -= n_batch;
-
-                continue;
+        {
+            if (const auto ret = llama_decode(ctx, batch) != 0) {
+                LOG_ERR("%s : failed to decode the batch, n_tokens = %d, ret = %d\n", __func__, batch.n_tokens, ret);
+                return 1;
             }
 
-            LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
+            LOG_DBG("%s : decoded batch of %d tokens\n", __func__, batch.n_tokens);
 
             for (auto & client : clients) {
-                if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
+                if (client.seq_id == -1) {
                     continue;
                 }
 
                 //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                 //    client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
 
-                const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i);
+                const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch);
 
                 common_sampler_accept(client.smpl, id, true);
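With this change the example no longer slices its batch into params.n_batch chunks; it sets params.n_batch = params.n_ctx up front and hands the whole batch to llama_decode(), which splits it into ubatches and retries internally. A minimal sketch of that caller-side pattern, assuming an initialized llama_context * ctx and the common.h batch helpers (the token/position values are illustrative, not the example's real data):

    // minimal sketch of the caller-side pattern after this change (illustrative
    // token/pos values); assumes llama.h and common.h are available
    static int decode_whole_batch(llama_context * ctx) {
        llama_batch batch = llama_batch_init(512, 0, 1); // capacity, no embeddings, 1 seq id per token

        common_batch_clear(batch);
        for (llama_pos pos = 0; pos < 32; ++pos) {
            // token id 0 as a stand-in; request logits only for the last position
            common_batch_add(batch, 0, pos, { 0 }, pos == 31);
        }

        // no manual n_batch chunking: llama_decode() handles ubatch splitting internally
        const int ret = llama_decode(ctx, batch);
        if (ret < 0) {
            LOG_ERR("%s: fatal decode error, ret = %d\n", __func__, ret);
        } else if (ret == 1) {
            LOG_WRN("%s: no KV cache slot found for a batch of %d tokens\n", __func__, batch.n_tokens);
        }

        llama_batch_free(batch);
        return ret;
    }

Return code 1 is the only non-fatal case left for the caller to handle: it means no KV cache slot was found even after the library's internal defrag and ubatch-halving retries.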

src/llama-context.cpp

Lines changed: 54 additions & 36 deletions

@@ -420,9 +420,9 @@ const llama_kv_cache * llama_context::get_kv_self() const {
     return kv_self;
 }
 
-void llama_context::kv_self_update() {
+bool llama_context::kv_self_update() {
     if (!memory) {
-        return;
+        return false;
     }
 
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
@@ -438,7 +438,11 @@
         if (!gf) {
             LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__);
         }
+
+        return true;
     }
+
+    return false;
 }
 
 enum llama_pooling_type llama_context::pooling_type() const {
@@ -891,25 +895,53 @@ int llama_context::decode(llama_batch & inp_batch) {
     // handle any pending defrags/shifts
     kv_self_update();
 
-    auto decode_state = kv_self->init(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
-    if (!decode_state) {
-        return -2;
-    }
+    llama_memory_decode_state_ptr decode_state;
 
-    switch (decode_state->get_status()) {
-        case LLAMA_MEMORY_STATUS_SUCCESS:
-            {
-            } break;
-        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
-            {
-                // not a fatal error, we can re-try with a different batch
-                return 1;
-            }
-        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
-            {
-                return -2;
-            }
-    }
+    bool did_defrag = false;
+    auto n_ubatch = cparams.n_ubatch;
+
+    do {
+        decode_state = kv_self->init(batch, n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
+        if (!decode_state) {
+            return -2;
+        }
+
+        switch (decode_state->get_status()) {
+            case LLAMA_MEMORY_STATUS_SUCCESS:
+                {
+                } break;
+            case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+                {
+                    if (!did_defrag) {
+                        did_defrag = true;
+
+                        kv_self->defrag_sched(-1.0f);
+                        if (kv_self_update()) {
+                            LLAMA_LOG_DEBUG("%s: failed to init batch of size %d, retrying after defrag\n", __func__, batch.n_tokens);
+
+                            continue;
+                        }
+                    }
+
+                    if (n_ubatch > 1) {
+                        n_ubatch /= 2;
+
+                        LLAMA_LOG_DEBUG("%s: failed to find free space in the KV cache, retrying with smaller ubatch size: n_ubatch = %d\n", __func__, n_ubatch);
+                        continue;
+                    }
+
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+
+                    return 1;
+                }
+            case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+                {
+                    return -2;
+                }
+        }
+
+        break;
+    } while(true);
 
     // reserve output buffer
     if (output_reserve(n_outputs_all) < n_outputs_all) {
@@ -2588,22 +2620,8 @@ int32_t llama_encode(
 int32_t llama_decode(
         llama_context * ctx,
           llama_batch   batch) {
-    int ret = ctx->decode(batch);
-
-    // defrag and try again
-    // TODO: distinguish return code when we are sure that even after defrag there is no space available
-    if (ret == 1) {
-        llama_kv_self_defrag(ctx);
-        ret = ctx->decode(batch);
-
-        if (ret == 1) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
-
-            return ret;
-        }
-    }
-
-    if (ret != 0) {
+    const int ret = ctx->decode(batch);
+    if (ret != 0 && ret != 1) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
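The new do { ... } while(true) loop amounts to a three-step retry policy when the memory module fails to prepare the batch: force one defragmentation pass (and retry only if kv_self_update() reports the cache actually changed), then halve the ubatch size down to 1, and only then give up with return code 1. A standalone sketch of that policy, with hypothetical try_fit/force_defrag callbacks standing in for kv_self->init(...) and defrag_sched(-1.0f) + kv_self_update() (not the library's API):

    #include <cstdint>
    #include <functional>

    // illustration of the retry policy used by the new decode loop;
    // try_fit and force_defrag are hypothetical stand-ins
    int decode_with_retries(uint32_t n_ubatch,
                            const std::function<bool(uint32_t)> & try_fit,
                            const std::function<bool()>         & force_defrag) {
        bool did_defrag = false;

        while (true) {
            if (try_fit(n_ubatch)) {
                return 0;             // every ubatch found a KV cache slot
            }

            if (!did_defrag) {
                did_defrag = true;
                if (force_defrag()) { // retry only if the cache actually changed
                    continue;
                }
            }

            if (n_ubatch > 1) {
                n_ubatch /= 2;        // smaller ubatches fit fragmented free space more easily
                continue;
            }

            return 1;                 // nothing fits even at n_ubatch == 1 -> caller sees ret == 1
        }
    }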

src/llama-context.h

Lines changed: 2 additions & 1 deletion

@@ -47,8 +47,9 @@ struct llama_context {
     llama_kv_cache * get_kv_self();
     const llama_kv_cache * get_kv_self() const;
 
+    // return true of the KV cache was updated
     // TODO: remove
-    void kv_self_update();
+    bool kv_self_update();
 
     enum llama_pooling_type pooling_type() const;

src/llama-kv-cache.cpp

Lines changed: 2 additions & 2 deletions

@@ -1738,14 +1738,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
                  bool   swa_full,
              uint32_t   kv_size,
              uint32_t   n_seq_max,
-             uint32_t   n_batch,
+             uint32_t   n_ubatch,
              uint32_t   n_pad) : hparams(model.hparams) {
     llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
     llama_kv_cache_unified::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
 
     const uint32_t size_base = kv_size;
 
-    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad));
+    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
 
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
     if (swa_full) {
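With the constructor taking n_ubatch, the SWA cache only needs to hold the attention window for each sequence plus the tokens of one ubatch in flight, rounded up to the cache padding. A worked example of the sizing formula with illustrative numbers (not taken from any particular model configuration):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // same rounding as GGML_PAD: round x up to a multiple of n (n is a power of two here)
    static uint32_t pad_up(uint32_t x, uint32_t n) { return (x + n - 1) & ~(n - 1); }

    int main() {
        // illustrative values, not from a real model config
        const uint32_t n_swa     = 4096;  // sliding-window size
        const uint32_t n_seq_max = 4;     // parallel sequences
        const uint32_t n_ubatch  = 512;   // micro-batch size
        const uint32_t n_pad     = 256;   // cache padding
        const uint32_t kv_size   = 32768; // base (non-SWA) cache size

        const uint32_t size_swa = std::min(kv_size, pad_up(n_swa*n_seq_max + n_ubatch, n_pad));

        // 4096*4 + 512 = 16896, already a multiple of 256 -> size_swa = 16896 cells
        printf("size_swa = %u\n", size_swa);
        return 0;
    }

Since the examples now set n_batch to the full context, sizing the SWA cache by n_ubatch instead of n_batch keeps it close to n_swa*n_seq_max cells rather than growing with the context size.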

src/llama-kv-cache.h

Lines changed: 1 addition & 1 deletion

@@ -251,7 +251,7 @@ class llama_kv_cache_unified_iswa : public llama_kv_cache {
                  bool   swa_full,
              uint32_t   kv_size,
              uint32_t   n_seq_max,
-             uint32_t   n_batch,
+             uint32_t   n_ubatch,
              uint32_t   n_pad);
 
     ~llama_kv_cache_unified_iswa() = default;

src/llama-model.cpp

Lines changed: 1 addition & 1 deletion

@@ -13234,7 +13234,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         params.swa_full,
                         cparams.n_ctx,
                         cparams.n_seq_max,
-                        cparams.n_batch,
+                        cparams.n_ubatch,
                         padding);
             } else {
                 GGML_ASSERT(!hparams.is_swa_any());

tools/batched-bench/batched-bench.cpp

Lines changed: 12 additions & 35 deletions

@@ -21,6 +21,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    params.n_batch = params.n_ctx;
+
    common_init();
 
    int is_pp_shared = params.is_pp_shared;
@@ -61,48 +63,21 @@
 
    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
 
-    // decode in batches of ctx_params.n_batch tokens
-    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-
-            llama_batch batch_view = {
-                n_tokens,
-                batch.token    + i,
-                nullptr,
-                batch.pos      + i,
-                batch.n_seq_id + i,
-                batch.seq_id   + i,
-                batch.logits   + i,
-            };
-
-            const int ret = llama_decode(ctx, batch_view);
-            if (ret != 0) {
-                LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
-                return false;
-            }
-
-            llama_synchronize(ctx);
-        }
-
-        return true;
-    };
-
    // warm up
    {
        for (int i = 0; i < 16; ++i) {
            common_batch_add(batch, 0, i, { 0 }, false);
        }
 
-        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+        if (const auto ret = llama_decode(ctx, batch)) {
+            LOG_ERR("%s: llama_decode() failed, ret = %d\n", __func__, ret);
            return 1;
        }
    }
 
    if (!params.batched_bench_output_jsonl) {
        LOG("\n");
-        LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+        LOG("%s: n_kv_max = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
        LOG("\n");
        LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
        LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
@@ -134,9 +109,11 @@
 
            llama_kv_self_clear(ctx);
 
-            if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                LOG_ERR("%s: llama_decode() failed\n", __func__);
-                return 1;
+            if (batch.n_tokens > 0) {
+                if (const auto ret = llama_decode(ctx, batch) != 0) {
+                    LOG_ERR("%s: llama_decode() failed, ret = %d\n", __func__, ret);
+                    return 1;
+                }
            }
 
            if (is_pp_shared) {
@@ -156,8 +133,8 @@
                    common_batch_add(batch, 0, pp + i, { j }, true);
                }
 
-                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                    LOG_ERR("%s: llama_decode() failed\n", __func__);
+                if (const auto ret = llama_decode(ctx, batch) != 0) {
+                    LOG_ERR("%s: llama_decode() failed, ret = %d\n", __func__, ret);
                    return 1;
                }
            }

tools/perplexity/perplexity.cpp

Lines changed: 6 additions & 5 deletions

@@ -856,7 +856,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
     double acc = 0.0f;
 
     const int n_ctx   = llama_n_ctx(ctx);
-    const int n_batch = params.n_batch;
+    const int n_batch = n_ctx;
 
     const int n_vocab = llama_vocab_n_tokens(vocab);
 
@@ -1154,7 +1154,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
     LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
 
     const int n_ctx   = llama_n_ctx(ctx);
-    const int n_batch = params.n_batch;
+    const int n_batch = n_ctx;
 
     const int n_vocab = llama_vocab_n_tokens(vocab);
 
@@ -1508,7 +1508,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
     LOG("\ntask\tacc_norm\n");
 
     const int n_ctx   = llama_n_ctx(ctx);
-    const int n_batch = params.n_batch;
+    const int n_batch = n_ctx;
 
     const int n_vocab = llama_vocab_n_tokens(vocab);
 
@@ -1732,7 +1732,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
         return;
     }
 
-    const int n_batch = params.n_batch;
+    const int n_batch = params.n_ctx;
     const int num_batches = (n_ctx + n_batch - 1)/n_batch;
     const int nv = 2*((n_vocab + 1)/2) + 4;
     const bool add_bos = llama_vocab_get_add_bos(vocab);
@@ -1982,12 +1982,13 @@ int main(int argc, char ** argv) {
     common_init();
 
     const int32_t n_ctx = params.n_ctx;
-
     if (n_ctx <= 0) {
         LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
         return 1;
     }
 
+    params.n_batch = n_ctx;
+
     const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
 
     if (ppl) {
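With n_batch pinned to the context size, the chunking arithmetic in kl_divergence() (and the analogous loops in the other scores) degenerates to one batch per context window: num_batches = (n_ctx + n_batch - 1)/n_batch is a ceiling division and evaluates to 1 whenever n_batch == n_ctx. A tiny check of that arithmetic with an illustrative context size:

    #include <cstdio>

    int main() {
        // ceiling division used by the perplexity tool; with n_batch == n_ctx it is always 1
        const int n_ctx   = 512;     // illustrative context size
        const int n_batch = n_ctx;   // what this commit sets
        const int num_batches = (n_ctx + n_batch - 1)/n_batch;

        printf("num_batches = %d\n", num_batches); // prints 1
        return 0;
    }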
