Commit 04d8fb4

kv-cache : rework interface (wip) [no ci]
1 parent dd394a6 commit 04d8fb4

File tree: 9 files changed (+613, -416 lines)


include/llama.h

Lines changed: 3 additions & 3 deletions
@@ -259,9 +259,9 @@ extern "C" {
         llama_token  *  token;
         float        *  embd;
         llama_pos    *  pos;
-        int32_t      *  n_seq_id;
-        llama_seq_id ** seq_id;
-        int8_t       *  logits;   // TODO: rename this to "output"
+        int32_t      *  n_seq_id; // TODO: remove, should belong to only 1 sequence
+        llama_seq_id ** seq_id;   // TODO: become llama_seq_id * seq_id;
+        int8_t       *  logits;   // TODO: rename this to "output"
     } llama_batch;

     enum llama_model_kv_override_type {
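
For reference, the public llama_batch above still lets a single batch item belong to several sequences through n_seq_id/seq_id; the new TODOs point toward each item carrying exactly one sequence id. A minimal sketch of how a batch is filled against the current layout (the some_tokens array and its values are placeholders, not from this commit):

    // fill a batch the way current callers do, with exactly one sequence per token
    llama_batch batch = llama_batch_init(/*n_tokens_alloc =*/ 4, /*embd =*/ 0, /*n_seq_max =*/ 1);

    for (int i = 0; i < 4; ++i) {
        batch.token[i]     = some_tokens[i]; // placeholder token ids
        batch.pos[i]       = i;              // position within the sequence
        batch.n_seq_id[i]  = 1;              // one sequence per item - what the TODO would make implicit
        batch.seq_id[i][0] = 0;              // sequence 0
        batch.logits[i]    = (i == 3);       // request output only for the last token
    }
    batch.n_tokens = 4;

    // ... llama_decode(ctx, batch); ...
    llama_batch_free(batch);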

src/llama-batch.cpp

Lines changed: 14 additions & 0 deletions
@@ -4,6 +4,15 @@
 #include <cstring>
 #include <algorithm>

+void llama_ubatch::update() {
+    for (uint32_t i = 0; i < n_tokens; ++i) {
+        const llama_seq_id s = seq_id[i][0];
+
+        seq_pos_min[s] = seq_pos_min[s] == -1 ? pos[i] : std::min(seq_pos_min[s], pos[i]);
+        seq_pos_max[s] = seq_pos_max[s] == -1 ? pos[i] : std::max(seq_pos_max[s], pos[i]);
+    }
+}
+
 llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
     // clear empty sequences
     // the previous ubatch is assumed to be gone,

@@ -26,6 +35,8 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
         /*n_tokens     =*/ 0,
         /*n_seq_tokens =*/ 0,
         /*n_seqs       =*/ 0,
+        /*seq_pos_min  =*/ {-1},
+        /*seq_pos_max  =*/ {-1},
         /*token        =*/ !has_embd ? ubatch_token.data() : nullptr,
         /*embd         =*/ has_embd ? ubatch_embd.data() : nullptr,
         /*pos          =*/ ubatch_pos.data(),

@@ -148,6 +159,7 @@ llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) {
         GGML_ASSERT(seq.size() == 1 && s.n_seq_id == 0); // don't mix with other splits
         add_seq_to_ubatch(ubatch, s, length);
     }
+    ubatch.update();
     return ubatch;
 }

@@ -175,6 +187,7 @@ llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) {
             if (length + n_tokens_in_ubatch > n_ubatch) { break; }
         }
     }
+    ubatch.update();
     return ubatch;
 }

@@ -187,6 +200,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
         GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits
         add_seq_to_ubatch(ubatch, s, length);
     }
+    ubatch.update();
     return ubatch;
 }
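
The update() pass added above records, per sequence, the smallest and largest position present in the ubatch, with -1 standing for "no token from this sequence yet". A self-contained sketch of the same bookkeeping on toy data (the arrays and sizes here are illustrative only):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // toy ubatch: 4 tokens, each tagged with one sequence id and a position
        const int32_t seq[4] = { 0, 0, 1, 1 };
        const int32_t pos[4] = { 5, 6, 0, 1 };

        int32_t seq_pos_min[2] = { -1, -1 }; // -1 = "unset"
        int32_t seq_pos_max[2] = { -1, -1 };

        for (int i = 0; i < 4; ++i) {
            const int32_t s = seq[i];
            seq_pos_min[s] = seq_pos_min[s] == -1 ? pos[i] : std::min(seq_pos_min[s], pos[i]);
            seq_pos_max[s] = seq_pos_max[s] == -1 ? pos[i] : std::max(seq_pos_max[s], pos[i]);
        }

        // prints: seq 0 -> [5, 6], seq 1 -> [0, 1]
        for (int s = 0; s < 2; ++s) {
            printf("seq %d -> [%d, %d]\n", s, seq_pos_min[s], seq_pos_max[s]);
        }
        return 0;
    }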

src/llama-batch.h

Lines changed: 9 additions & 3 deletions
@@ -1,25 +1,31 @@
 #pragma once

 #include "llama.h"
+#include "llama-cparams.h"

 #include <array>
 #include <vector>

 // very similar to llama_batch,
 // but has more metadata about sequences
 struct llama_ubatch {
+    void update();
+
     bool equal_seqs;
     // TODO: whole_seqs for embeddings?

-    uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
+    uint32_t n_tokens;     // total tokens (n_seq_tokens * n_seqs)
     uint32_t n_seq_tokens; // tokens per sequence
     uint32_t n_seqs;

+    llama_pos seq_pos_min[LLAMA_MAX_PARALLEL_SEQUENCES]; // min position of each sequence
+    llama_pos seq_pos_max[LLAMA_MAX_PARALLEL_SEQUENCES]; // max position of each sequence
+
     llama_token  *  token;    // [n_tokens]
     float        *  embd;     // [n_embd, n_tokens]
     llama_pos    *  pos;      // [n_tokens]
-    int32_t      *  n_seq_id; // [n_seqs]
-    llama_seq_id ** seq_id;   // [n_seqs]
+    int32_t      *  n_seq_id; // [n_seqs] // TODO: remove, should belong to only 1 sequence
+    llama_seq_id ** seq_id;   // [n_seqs] // TODO: become llama_seq_id * seq_id;
     int8_t       *  output;   // [n_tokens]
 };
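
Since seq_pos_min/seq_pos_max are plain array members (sized by LLAMA_MAX_PARALLEL_SEQUENCES, which is why llama-cparams.h is now included), every aggregate initializer of llama_ubatch has to supply values for them; the call sites in llama-context.cpp below pass {-1}, {-1}. A hedged sketch of such an initializer, with the non-new fields reduced to placeholders:

    // sketch only: mirrors the reserve-graph call sites in llama-context.cpp
    llama_token token = 0; // placeholder token id

    llama_ubatch ubatch = {
        /*equal_seqs   =*/ true,
        /*n_tokens     =*/ 1,
        /*n_seq_tokens =*/ 1,
        /*n_seqs       =*/ 1,
        /*seq_pos_min  =*/ {-1}, // note: brace-init sets element 0 to -1 and zero-initializes the rest
        /*seq_pos_max  =*/ {-1}, // "unset" sentinel; llama_ubatch::update() derives the real ranges for split ubatches
        /*token        =*/ &token,
        /*embd         =*/ nullptr,
        /*pos          =*/ nullptr,
        /*n_seq_id     =*/ nullptr,
        /*seq_id       =*/ nullptr,
        /*output       =*/ nullptr,
    };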

src/llama-context.cpp

Lines changed: 25 additions & 36 deletions
@@ -285,7 +285,7 @@ llama_context::llama_context(

     // reserve pp graph first so that buffers are only allocated once
     {
-        llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+        llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, {-1}, {-1}, &token, nullptr, nullptr, nullptr, nullptr, nullptr};

         // max number of outputs
         n_outputs = ubatch_pp.n_tokens;

@@ -305,7 +305,7 @@ llama_context::llama_context(

     // reserve with tg graph to get the number of splits and nodes
     {
-        llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+        llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, {-1}, {-1}, &token, nullptr, nullptr, nullptr, nullptr, nullptr};

         n_outputs = ubatch_tg.n_tokens;


@@ -324,7 +324,7 @@ llama_context::llama_context(

     // reserve again with pp graph to avoid ggml-alloc reallocations during inference
     {
-        llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+        llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, {-1}, {-1}, &token, nullptr, nullptr, nullptr, nullptr, nullptr};

         n_outputs = ubatch_pp.n_tokens;


@@ -472,7 +472,7 @@ void llama_context::kv_self_update() {
     kv_self->set_full();

     llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-    llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+    llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, {-1}, {-1}, &token, nullptr, nullptr, nullptr, nullptr, nullptr};

     auto * gf = graph_init();
     graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);

@@ -731,8 +731,6 @@ int llama_context::encode(llama_batch & inp_batch) {

     n_outputs = n_tokens;

-    //batch_manager->prepare(ubatch);
-
     ggml_backend_sched_reset(sched.get());
     ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);


@@ -883,8 +881,6 @@ int llama_context::decode(llama_batch & inp_batch) {
     const int64_t n_tokens_all = batch.n_tokens;
     const int64_t n_embd = hparams.n_embd;

-    llama_kv_cache_guard kv_guard(kv_self);
-
     GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

     if (batch.token) {

@@ -924,21 +920,24 @@ int llama_context::decode(llama_batch & inp_batch) {
         n_outputs_all = 1;
     }

-    llama_sbatch sbatch = kv_self->sbatch_init(batch, /* logits_all */ n_outputs_all == n_tokens_all);
+    // handle any pending defrags/shifts
+    kv_self_update();
+
+    auto decode_state = kv_self->init(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
+    if (!decode_state) {
+        return 1;
+    }

     // reserve output buffer
     if (output_reserve(n_outputs_all) < n_outputs_all) {
         LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
         return -2;
     };

-    // handle any pending defrags/shifts
-    kv_self_update();
-
     int64_t n_outputs_prev = 0;

-    while (sbatch.n_tokens > 0) {
-        llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
+    while (const auto * ubatch_ptr = decode_state->next()) {
+        const auto & ubatch = *ubatch_ptr;

         // count the outputs in this u_batch
         {

@@ -957,11 +956,6 @@ int llama_context::decode(llama_batch & inp_batch) {
             n_outputs = n_outputs_new;
         }

-        // find KV slot
-        if (!kv_self->find_slot(ubatch)) {
-            return 1;
-        }
-
         ggml_backend_sched_reset(sched.get());
         ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);


@@ -1072,17 +1066,14 @@ int llama_context::decode(llama_batch & inp_batch) {
         n_outputs_prev += n_outputs;
     }

-    // finalize the batch processing
-    kv_guard.commit();
-
     // set to total number of outputs in the batch, for use in llama_get_logits_ith
     n_outputs = n_outputs_all;

     // set output mappings
     {
         bool sorted_output = true;

-        auto & out_ids = sbatch.out_ids;
+        auto & out_ids = decode_state->out_ids();

         GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all);
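
With this change, decode() obtains a batch-processing state from kv_self->init(...) and pulls ubatches from it via next(), instead of building an sbatch and calling find_slot() per ubatch. The declaration of that state object is in the kv-cache headers, which are among the changed files not shown in this excerpt; inferred purely from the call sites above, its shape is roughly the following (names, element types and signatures are assumptions, not the actual interface):

    #include <cstddef>
    #include <vector>

    struct llama_ubatch; // from llama-batch.h

    // hypothetical sketch of the object returned by kv_self->init()
    struct llama_kv_decode_state {
        virtual ~llama_kv_decode_state() = default;

        // next ubatch to process, or nullptr once the batch is exhausted;
        // the cache is expected to have reserved a slot for it (no explicit find_slot() call anymore)
        virtual const llama_ubatch * next() = 0;

        // mapping of outputs back to their order in the original batch
        virtual std::vector<std::size_t> & out_ids() = 0;
    };

    // usage, mirroring llama_context::decode():
    //
    //   auto decode_state = kv_self->init(batch, cparams.n_ubatch, embd_pooled, logits_all);
    //   if (!decode_state) { return 1; }
    //   while (const auto * ubatch = decode_state->next()) {
    //       // build and compute the graph for *ubatch
    //   }
    //   auto & out_ids = decode_state->out_ids();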

@@ -1939,7 +1930,6 @@ void llama_context::opt_epoch_iter(
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

     kv_self->clear();
-    llama_kv_cache_guard kv_guard(kv_self);

     for (uint32_t pos_ctx = 0; pos_ctx < n_ctx; pos_ctx += n_batch) {
         batch.n_tokens = n_batch;

@@ -1962,25 +1952,26 @@ void llama_context::opt_epoch_iter(

         int64_t n_outputs_all = n_tokens_all;

-        llama_sbatch sbatch = kv_self->sbatch_init(batch, /*logits_all =*/ true);
+        //llama_sbatch sbatch = kv_self->sbatch_init(batch, /*logits_all =*/ true);
+        auto decode_state = kv_self->init(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
+        if (!decode_state) {
+            LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__);
+            break;
+        }

         // reserve output buffer
         if (output_reserve(n_outputs_all) < n_outputs_all) {
             LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
             GGML_ABORT("TODO: handle this error");
         };

-        for (uint32_t pos_batch = 0; pos_batch < n_batch; pos_batch += n_ubatch) {
-            llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
+        uint32_t pos_batch = 0;
+        while (const auto * ubatch_ptr = decode_state->next()) {
+            const auto & ubatch = *ubatch_ptr;

-            n_outputs = ubatch.n_tokens;
+            pos_batch += ubatch.n_tokens;

-            // TODO: not sure if this is needed
-            if (!kv_self->find_slot(ubatch)) {
-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-                GGML_ABORT("TODO: handle this error");
-            }
+            n_outputs = ubatch.n_tokens;

             auto * gf = graph_init();
             auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);

@@ -2017,8 +2008,6 @@ void llama_context::opt_epoch_iter(
             ggml_free(ctx_compute_opt);
         }
     }
-
-    kv_guard.commit();
 }

 void llama_context::opt_epoch(
